Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Jan 11

Commit

0d2d632

verified ·

1 Parent(s): 44f2fb5

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -46

app.py CHANGED Viewed

@@ -32,6 +32,28 @@ class VirusClassifier(nn.Module):
     def forward(self, x):
         return self.network(x)
 ###############################################################################
 # Utility Functions
@@ -65,6 +87,7 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     Convert a single nucleotide sequence to a k-mer frequency vector
     of length 4^k (e.g., for k=4, length=256).
     """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
@@ -122,6 +145,7 @@ def create_freq_sigma_plot(
     # color by sign (positive=green, negative=red)
     colors  = ["green" if d["shap"] >= 0 else "red" for d in top_data]
     x = np.arange(len(kmers))
     width = 0.4
@@ -129,7 +153,8 @@ def create_freq_sigma_plot(
     # Frequency
     ax.bar(x - width/2, freqs, width, color=colors, alpha=0.7, label="Frequency (%)")
     ax.set_ylabel("Frequency (%)", color='black')
-    ax.set_ylim(0, max(freqs)*1.2 if len(freqs) else 1)
     # Twin axis for sigma
     ax2 = ax.twinx()
@@ -160,7 +185,7 @@ def run_classification_and_shap(file_obj):
       - shap_values object (SHAP values for the entire batch)
       - array/batch of scaled vectors (for use in the waterfall selection)
       - list of k-mers (for indexing)
-      - possibly the model or other context
     """
     # 1. Basic read
     if isinstance(file_obj, str):
@@ -192,12 +217,15 @@ def run_classification_and_shap(file_obj):
     # 4. Load model & scaler
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = VirusClassifier(input_shape=4**k).to(device)
-        state_dict = torch.load("model.pt", map_location=device)
         model.load_state_dict(state_dict)
         model.eval()
         scaler = joblib.load("scaler.pkl")
     except Exception as e:
         return None, None, f"Error loading model or scaler: {str(e)}"
@@ -224,20 +252,18 @@ def run_classification_and_shap(file_obj):
     # 7. SHAP Explainer
     # We'll pick a background subset if there are many sequences
-    # (For performance, we might limit to e.g. 50 samples max)
     if scaled_data.shape[0] > 50:
         background_data = scaled_data[:50]
     else:
         background_data = scaled_data
-    # Use the "new" unified shap.Explainer approach
-    # We pass in a function that does the forward pass. Or pass the model directly.
-    # For PyTorch models, shap can do a direct 'model' approach with a mask.
-    # We'll do a simple "use shap.Explainer" with data=background_data
-    explainer = shap.Explainer(model, background_data)
     shap_values = explainer(scaled_data)  # shape=(num_samples, num_features)
     # k-mer list
     kmer_list = [''.join(p) for p in product("ACGT", repeat=k)]
     return (results_table, shap_values, scaled_data, kmer_list, None)
@@ -249,8 +275,8 @@ def run_classification_and_shap(file_obj):
 def main_predict(file_obj):
     """
     This function is triggered by the 'Run' button in Gradio.
-    It returns a markdown of all sequences/predictions and stores
-    data needed for the subsequent SHAP visualizations.
     """
     results, shap_vals, scaled_data, kmer_list, err = run_classification_and_shap(file_obj)
     if err:
@@ -270,32 +296,26 @@ def main_predict(file_obj):
         )
     md += "\nSelect a sequence index below to view SHAP Waterfall & Frequency plots."
-    # Return the string, and also the shap values plus data needed
-    # We'll store these to SessionState via Gradio's "State" or we can
-    # pass them out as hidden fields.
     return (md, shap_vals, scaled_data, kmer_list, results)
 def update_waterfall_plot(selected_index, shap_values_obj):
     """
-    Build a waterfall plot for the user-selected sample.
     """
     if shap_values_obj is None:
         return None
     try:
         selected_index = int(selected_index)
     except:
         selected_index = 0
-    # We'll create the figure by calling shap.plots.waterfall
-    # Convert shap_values_obj to the new shap interface
-    # shap_values_obj is a shap._explanation.Explanation typically
-    # We can create a figure with shap.plots.waterfall and capture it as an image
     shap_plots_fig = plt.figure(figsize=(8, 5))
-    shap.plots.waterfall(shap_values_obj[selected_index], max_display=14,
-                         show=False)  # show=False so it doesn't pop in the notebook
     buf = io.BytesIO()
     plt.savefig(buf, format='png', bbox_inches='tight', dpi=120)
     buf.seek(0)
@@ -304,7 +324,6 @@ def update_waterfall_plot(selected_index, shap_values_obj):
     return wf_img
 def update_beeswarm_plot(shap_values_obj):
     """
     Build a beeswarm plot across all samples.
@@ -312,6 +331,9 @@ def update_beeswarm_plot(shap_values_obj):
     if shap_values_obj is None:
         return None
     beeswarm_fig = plt.figure(figsize=(8, 5))
     shap.plots.beeswarm(shap_values_obj, show=False)
     buf = io.BytesIO()
@@ -322,11 +344,10 @@ def update_beeswarm_plot(shap_values_obj):
     return bs_img
 def update_freq_plot(selected_index, shap_values_obj, scaled_data, kmer_list, file_obj):
     """
     Create the frequency & sigma bar chart for the selected sequence's top-10 k-mers.
-    We'll need to also compute the raw_freq_vector from the original unscaled data.
     """
     if shap_values_obj is None or scaled_data is None or kmer_list is None:
         return None
@@ -336,23 +357,17 @@ def update_freq_plot(selected_index, shap_values_obj, scaled_data, kmer_list, fi
     except:
         selected_index = 0
-    # We must re-generate the raw freq vector from the original input file
-    # or store it from earlier. Let's just re-run parse for that single sequence:
-    # But simpler is: run_classification_and_shap was storing all_raw_vectors...
-    # Let's do a quick approach: run_classification_and_shap already computed it
-    # but we didn't store it. We'll re-run the parse logic to get the raw freq again.
-    # For memory / speed reasons, better is to store it.
-    # For simplicity, let's parse again quickly:
     if isinstance(file_obj, str):
         text = file_obj
     else:
         text = file_obj.decode('utf-8')
     sequences = parse_fasta(text)
-    # the selected_index might be out of range, so let's clamp it
     if selected_index >= len(sequences):
         selected_index = 0
-    seq = sequences[selected_index][1]  # get the sequence
     raw_vec = sequence_to_kmer_vector(seq, k=4)
     single_shap_values = shap_values_obj.values[selected_index]
@@ -376,11 +391,11 @@ def update_freq_plot(selected_index, shap_values_obj, scaled_data, kmer_list, fi
 # Gradio Interface
 ###############################################################################
 with gr.Blocks(title="Multi-Sequence Virus Host Classifier with SHAP") as demo:
-    shap.initjs()  # load shap JS for interactive plots in some contexts (optional)
     gr.Markdown(
         """
-        # **Advanced Virus Host Classifier with SHAP**
         **Upload a FASTA file** with one or more nucleotide sequences.
         This app will:
         1. Predict each sequence's **host** (human vs. non-human).
@@ -407,7 +422,7 @@ with gr.Blocks(title="Multi-Sequence Virus Host Classifier with SHAP") as demo:
             md_out = gr.Markdown()
         with gr.Tab("SHAP Waterfall"):
-            # We'll let user pick the sequence index from a dropdown or slider
             with gr.Row():
                 seq_index_dropdown = gr.Number(label="Sequence Index (0-based)", value=0, precision=0)
                 update_wf_btn = gr.Button("Update Waterfall")
@@ -424,34 +439,39 @@ with gr.Blocks(title="Multi-Sequence Virus Host Classifier with SHAP") as demo:
             fs_plot = gr.Image(label="Top-10 Frequency & σ Chart")
     # --- Button Logic ---
     run_btn.click(
         fn=main_predict,
         inputs=[file_input],
         outputs=[md_out, shap_values_state, scaled_data_state, kmer_list_state, results_state]
     )
-    run_btn.click(  # Also store the raw file data for later freq plots
         fn=lambda x: x,
         inputs=file_input,
         outputs=file_data_state
     )
     update_wf_btn.click(
         fn=update_waterfall_plot,
         inputs=[seq_index_dropdown, shap_values_state],
         outputs=[wf_plot]
     )
-    update_fs_btn.click(
-        fn=update_freq_plot,
-        inputs=[seq_index_dropdown2, shap_values_state, scaled_data_state, kmer_list_state, file_data_state],
-        outputs=[fs_plot]
-    )
-    # We can auto-generate the beeswarm right after classification as well
     run_btn.click(
         fn=update_beeswarm_plot,
         inputs=[shap_values_state],
         outputs=[bs_plot]
     )
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

     def forward(self, x):
         return self.network(x)
+###############################################################################
+# Torch Model Wrapper for SHAP
+###############################################################################
+class TorchModelWrapper:  # adapter so numpy-based callers can invoke a torch nn.Module
+    """
+    A simple callable that takes a PyTorch model and device,
+    and allows SHAP to pass in numpy arrays, which we convert to torch tensors.
+    """
+    def __init__(self, model: nn.Module, device='cpu'):
+        self.model = model  # not moved here; caller is expected to have placed it on `device`
+        self.device = device  # device that input tensors are sent to before the forward pass
+    def __call__(self, x_np: np.ndarray):
+        """
+        x_np: shape=(batch_size, num_features) as a numpy array
+        Returns: numpy array of shape=(batch_size, num_outputs)
+        """
+        x_torch = torch.from_numpy(x_np).float().to(self.device)  # numpy -> float32 tensor on self.device
+        with torch.no_grad():  # inference only; no gradient tracking needed
+            out = self.model(x_torch).cpu().numpy()  # bring result back to host memory, then to numpy
+        return out
 ###############################################################################
 # Utility Functions
     Convert a single nucleotide sequence to a k-mer frequency vector
     of length 4^k (e.g., for k=4, length=256).
     """
+    from itertools import product
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
     # color by sign (positive=green, negative=red)
     colors  = ["green" if d["shap"] >= 0 else "red" for d in top_data]
+    import matplotlib.pyplot as plt
     x = np.arange(len(kmers))
     width = 0.4
     # Frequency
     ax.bar(x - width/2, freqs, width, color=colors, alpha=0.7, label="Frequency (%)")
     ax.set_ylabel("Frequency (%)", color='black')
+    if freqs:
+        ax.set_ylim(0, max(freqs)*1.2)
     # Twin axis for sigma
     ax2 = ax.twinx()
       - shap_values object (SHAP values for the entire batch)
       - array/batch of scaled vectors (for use in the waterfall selection)
       - list of k-mers (for indexing)
+      - error message or None
     """
     # 1. Basic read
     if isinstance(file_obj, str):
     # 4. Load model & scaler
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = VirusClassifier(input_shape=4**k).to(device)
+        # Set weights_only=True to suppress the future pickle warning
+        state_dict = torch.load("model.pt", map_location=device, weights_only=True)
         model.load_state_dict(state_dict)
         model.eval()
         scaler = joblib.load("scaler.pkl")
     except Exception as e:
         return None, None, f"Error loading model or scaler: {str(e)}"
     # 7. SHAP Explainer
     # We'll pick a background subset if there are many sequences
     if scaled_data.shape[0] > 50:
         background_data = scaled_data[:50]
     else:
         background_data = scaled_data
+    # Wrap the model so it can handle numpy -> tensor
+    wrapped_model = TorchModelWrapper(model, device)
+    explainer = shap.Explainer(wrapped_model, background_data)
     shap_values = explainer(scaled_data)  # shape=(num_samples, num_features)
     # k-mer list
+    from itertools import product
     kmer_list = [''.join(p) for p in product("ACGT", repeat=k)]
     return (results_table, shap_values, scaled_data, kmer_list, None)
 def main_predict(file_obj):
     """
     This function is triggered by the 'Run' button in Gradio.
+    It returns a markdown of all sequences/predictions and
+    the shap values plus data needed for subsequent plots.
     """
     results, shap_vals, scaled_data, kmer_list, err = run_classification_and_shap(file_obj)
     if err:
         )
     md += "\nSelect a sequence index below to view SHAP Waterfall & Frequency plots."
     return (md, shap_vals, scaled_data, kmer_list, results)
 def update_waterfall_plot(selected_index, shap_values_obj):
     """
+    Build a waterfall plot for the user-selected sample using shap.plots.waterfall.
     """
     if shap_values_obj is None:
         return None
+    import matplotlib.pyplot as plt
+    import shap
     try:
         selected_index = int(selected_index)
     except:
         selected_index = 0
+    # Create the figure by calling shap.plots.waterfall
     shap_plots_fig = plt.figure(figsize=(8, 5))
+    shap.plots.waterfall(shap_values_obj[selected_index], max_display=14, show=False)
     buf = io.BytesIO()
     plt.savefig(buf, format='png', bbox_inches='tight', dpi=120)
     buf.seek(0)
     return wf_img
 def update_beeswarm_plot(shap_values_obj):
     """
     Build a beeswarm plot across all samples.
     if shap_values_obj is None:
         return None
+    import matplotlib.pyplot as plt
+    import shap
     beeswarm_fig = plt.figure(figsize=(8, 5))
     shap.plots.beeswarm(shap_values_obj, show=False)
     buf = io.BytesIO()
     return bs_img
 def update_freq_plot(selected_index, shap_values_obj, scaled_data, kmer_list, file_obj):
     """
     Create the frequency & sigma bar chart for the selected sequence's top-10 k-mers.
+    We must re-parse the raw freq vector for that sequence, or store it from earlier.
     """
     if shap_values_obj is None or scaled_data is None or kmer_list is None:
         return None
     except:
         selected_index = 0
+    # Re-parse the FASTA to get the corresponding sequence
     if isinstance(file_obj, str):
         text = file_obj
     else:
         text = file_obj.decode('utf-8')
     sequences = parse_fasta(text)
     if selected_index >= len(sequences):
         selected_index = 0
+    seq = sequences[selected_index][1]
     raw_vec = sequence_to_kmer_vector(seq, k=4)
     single_shap_values = shap_values_obj.values[selected_index]
 # Gradio Interface
 ###############################################################################
 with gr.Blocks(title="Multi-Sequence Virus Host Classifier with SHAP") as demo:
+    shap.initjs()  # load shap JS if needed for interactive HTML (optional)
     gr.Markdown(
         """
+        # **Virus Host Classifier with SHAP**
         **Upload a FASTA file** with one or more nucleotide sequences.
         This app will:
         1. Predict each sequence's **host** (human vs. non-human).
             md_out = gr.Markdown()
         with gr.Tab("SHAP Waterfall"):
+            # We'll let user pick the sequence index from a dropdown or input
             with gr.Row():
                 seq_index_dropdown = gr.Number(label="Sequence Index (0-based)", value=0, precision=0)
                 update_wf_btn = gr.Button("Update Waterfall")
             fs_plot = gr.Image(label="Top-10 Frequency & σ Chart")
     # --- Button Logic ---
+    # 1) The main classification run
     run_btn.click(
         fn=main_predict,
         inputs=[file_input],
         outputs=[md_out, shap_values_state, scaled_data_state, kmer_list_state, results_state]
     )
+    # Also store raw file data for subsequent freq usage
+    run_btn.click(
         fn=lambda x: x,
         inputs=file_input,
         outputs=file_data_state
     )
+    # 2) Waterfall update
     update_wf_btn.click(
         fn=update_waterfall_plot,
         inputs=[seq_index_dropdown, shap_values_state],
         outputs=[wf_plot]
     )
+    # 3) Beeswarm update
     run_btn.click(
         fn=update_beeswarm_plot,
         inputs=[shap_values_state],
         outputs=[bs_plot]
     )
+    # 4) Frequency top-10 update
+    update_fs_btn.click(
+        fn=update_freq_plot,
+        inputs=[seq_index_dropdown2, shap_values_state, scaled_data_state, kmer_list_state, file_data_state],
+        outputs=[fs_plot]
+    )
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)