Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 11

Commit

d192dd4

verified ·

1 Parent(s): 0e7de0c

Update app.py

Browse files

Files changed (1) hide show

app.py +334 -346

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import torch.nn as nn
 import matplotlib.pyplot as plt
 import io
 from PIL import Image
 ###############################################################################
 # Model Definition
@@ -30,30 +31,16 @@ class VirusClassifier(nn.Module):
     def forward(self, x):
+        """Return the output of self.network applied to x."""
         return self.network(x)
-    def get_feature_importance(self, x):
-        """
-        Calculate gradient-based feature importance, specifically for the
-        'human' class (index=1) by computing gradient of that probability wrt x.
-        """
-        x.requires_grad_(True)
-        output = self.network(x)
-        probs = torch.softmax(output, dim=1)
-        # Probability of 'human' class (index=1)
-        human_prob = probs[..., 1]
-        if x.grad is not None:
-            x.grad.zero_()
-        human_prob.backward()
-        importance = x.grad  # shape: (batch_size, n_features)
-        return importance, float(human_prob)
 ###############################################################################
 # Utility Functions
 ###############################################################################
 def parse_fasta(text):
-    """Parses text input in FASTA format into a list of (header, sequence)."""
     sequences = []
     current_header = None
     current_sequence = []
@@ -74,7 +61,10 @@ def parse_fasta(text):
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
-    """Convert a single nucleotide sequence to a k-mer frequency vector."""
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
@@ -92,377 +82,375 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
 ###############################################################################
-# Visualization
 ###############################################################################
-def create_shap_waterfall_plot(important_kmers, all_kmer_importance, human_prob, title):
-    """
-    Create a SHAP-like waterfall plot:
-      - Start at baseline = 0.5
-      - Add a bar for "Other" which is the combined effect of all less-important k-mers
-      - Then apply each of the top k-mers in descending order of absolute importance
-      - Show final predicted human probability as the endpoint
     """
-    # 1) Sort 'important_kmers' by absolute impact descending
-    sorted_kmers = sorted(important_kmers, key=lambda x: x['impact'], reverse=True)
-    # 2) Compute the total effect of "other" k-mers
-    #    We have 256 total features. We selected top 10. Sum the rest.
-    top_ids = set([km['idx'] for km in sorted_kmers])
-    other_contributions = []
-    for i, val in enumerate(all_kmer_importance):
-        if i not in top_ids:
-            other_contributions.append(val)
-    # sum up those "other" contributions
-    other_sum = np.sum(other_contributions)
-    # The "impact" for "other" will be the absolute value, direction depends on sign
-    other_impact = float(abs(other_sum))
-    other_direction = "human" if other_sum > 0 else "non-human"
-    # 3) Build a list of all bars: first "other", then each top k-mer
-    # Each bar needs: name, raw_contribution_value
-    # We'll store (label, contribution). The sign indicates direction.
-    bars = []
-    bars.append(("Other", other_sum))  # lumps the leftover k-mers
-    for km in sorted_kmers:
-        # We re-inject the sign on the raw gradient
-        # (We stored only the absolute in "impact," so let's create a signed value)
-        signed_val = km['impact'] if km['direction'] == 'human' else -km['impact']
-        bars.append((km['kmer'], signed_val))
-    # 4) Waterfall plot data:
-    # We'll accumulate partial sums from baseline=0.5
-    baseline = 0.5
-    running_val = baseline
-    x_labels = []
-    y_vals = []
-    bar_colors = []
-    # We'll use green for positive contributions (pushing toward 'human'),
-    # red for negative contributions (pushing away from 'human')
-    for (label, contrib) in bars:
-        x_labels.append(label)
-        # new value after adding this contribution
-        new_val = running_val + (0.05 * contrib)
-        # ^ scaled by 0.05 for better display. Adjust as desired.
-        y_vals.append((running_val, new_val))
-        running_val = new_val
-        if contrib >= 0:
-            bar_colors.append("green")
-        else:
-            bar_colors.append("red")
-    final_prob = running_val
-    # Final point is the model's predicted probability (not always exact, but this is a shap-like idea).
-    # If we want to forcibly ensure final_prob = human_prob, we could do:
-    #   correction = human_prob - running_val
-    #   running_val += correction
-    # but for now let's keep the "waterfall" purely additive from the gradient.
-    # Let's plot:
-    fig, ax = plt.subplots(figsize=(10, 6))
-    # We'll create the bars manually
-    x_positions = np.arange(len(x_labels))
-    last_end = baseline
-    for i, ((start_val, end_val), color) in enumerate(zip(y_vals, bar_colors)):
-        # The bar's height is the difference
-        height = end_val - start_val
-        ax.bar(i, height, bottom=start_val, color=color, edgecolor='black', alpha=0.7)
-        ax.text(i, (start_val + end_val) / 2, f"{height:+.3f}", ha='center', va='center', color='white', fontsize=8)
-    ax.axhline(y=baseline, color='black', linestyle='--', linewidth=1)
-    ax.set_xticks(x_positions)
-    ax.set_xticklabels(x_labels, rotation=45, ha='right')
-    ax.set_ylim(0, 1)
-    ax.set_ylabel("Running Probability (Human)")
-    ax.set_title(f"SHAP-like Waterfall — Final Probability: {final_prob:.3f} (Model Probability: {human_prob:.3f})")
-    plt.tight_layout()
-    return fig
-def create_frequency_sigma_plot(important_kmers, title):
-    """Creates a bar plot of the top k-mers (by importance) showing frequency (%) and σ from mean."""
-    # Sort by absolute impact
-    sorted_kmers = sorted(important_kmers, key=lambda x: x['impact'], reverse=True)
-    kmers = [k["kmer"] for k in sorted_kmers]
-    frequencies = [k["occurrence"] for k in sorted_kmers]  # in %
-    sigmas = [k["sigma"] for k in sorted_kmers]
-    directions = [k["direction"] for k in sorted_kmers]
     x = np.arange(len(kmers))
     width = 0.4
-    fig, ax_bar = plt.subplots(figsize=(10, 6))
-    # Bar for frequency
-    bars_freq = ax_bar.bar(
-        x - width/2, frequencies, width, alpha=0.7,
-        color=["green" if d=="human" else "red" for d in directions],
-        label="Frequency (%)"
-    )
-    ax_bar.set_ylabel("Frequency (%)")
-    ax_bar.set_ylim(0, max(frequencies) * 1.2 if frequencies else 1)
-    # Twin axis for σ
-    ax_bar_twin = ax_bar.twinx()
-    bars_sigma = ax_bar_twin.bar(
-        x + width/2, sigmas, width, alpha=0.5, color="gray", label="σ from Mean"
-    )
-    ax_bar_twin.set_ylabel("Standard Deviations (σ)")
-    ax_bar.set_title(f"Frequency & σ from Mean for Top k-mers — {title}")
-    ax_bar.set_xticks(x)
-    ax_bar.set_xticklabels(kmers, rotation=45, ha='right')
-    # Combined legend
-    lines1, labels1 = ax_bar.get_legend_handles_labels()
-    lines2, labels2 = ax_bar_twin.get_legend_handles_labels()
-    ax_bar.legend(lines1 + lines2, labels1 + labels2, loc="upper right")
-    plt.tight_layout()
-    return fig
-def create_importance_bar_plot(important_kmers, title):
-    """
-    Create a simple bar chart showing the absolute gradient magnitude
-    for the top k-mers, sorted descending.
-    """
-    sorted_kmers = sorted(important_kmers, key=lambda x: x['impact'], reverse=True)
-    kmers = [k['kmer'] for k in sorted_kmers]
-    impacts = [k['impact'] for k in sorted_kmers]
-    directions = [k["direction"] for k in sorted_kmers]
-    x = np.arange(len(kmers))
-    fig, ax = plt.subplots(figsize=(10, 6))
-    bar_colors = ["green" if d=="human" else "red" for d in directions]
-    ax.bar(x, impacts, color=bar_colors, alpha=0.7)
     ax.set_xticks(x)
     ax.set_xticklabels(kmers, rotation=45, ha='right')
-    ax.set_title(f"Absolute Feature Importance (Top k-mers) — {title}")
-    ax.set_ylabel("Gradient Magnitude")
-    ax.grid(axis="y", alpha=0.3)
     plt.tight_layout()
     return fig
 ###############################################################################
-# Prediction Function
 ###############################################################################
-def predict(file_obj):
     """
-    Main function for Gradio:
-      1. Reads the uploaded FASTA file or text.
-      2. Loads the model and scaler.
-      3. Generates predictions, probabilities, and top k-mers.
-      4. Returns multiple outputs:
-         - A textual summary (Markdown).
-         - Waterfall plot.
-         - Frequency & sigma plot.
-         - Absolute importance bar plot.
     """
-    # 0. Basic file read
-    if file_obj is None:
-        return (
-            "Please upload a FASTA file.",
-            None,
-            None,
-            None
-        )
-    try:
-        # If user provided raw text, use that
-        if isinstance(file_obj, str):
-            text = file_obj
-        else:
-            # If user uploaded a file, decode it
-            text = file_obj.decode('utf-8')
-    except Exception as e:
-        return (
-            f"Error reading file: {str(e)}",
-            None,
-            None,
-            None
-        )
-    # 1. Parse FASTA
     sequences = parse_fasta(text)
     if len(sequences) == 0:
-        return (
-            "No valid FASTA sequences found. Please check your input.",
-            None,
-            None,
-            None
-        )
-    # We’ll just classify the first sequence for demonstration
-    header, seq = sequences[0]
-    # 2. Create k-mer vector & load model
     k = 4
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Prepare raw freq vector & scale
-        raw_freq_vector = sequence_to_kmer_vector(seq, k=k)
-        # Load model & scaler
         model = VirusClassifier(input_shape=4**k).to(device)
-        state_dict = torch.load('model.pt', map_location=device)
         model.load_state_dict(state_dict)
-        scaler = joblib.load('scaler.pkl')
         model.eval()
-        scaled_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
-        X_tensor = torch.FloatTensor(scaled_vector).to(device)
-        # 3. Inference
-        with torch.no_grad():
-            logits = model(X_tensor)
-            probs = torch.softmax(logits, dim=1)
-        human_prob = float(probs[0][1])
-        non_human_prob = float(probs[0][0])
-        pred_class = 1 if human_prob >= non_human_prob else 0
-        pred_label = "human" if pred_class == 1 else "non-human"
-        confidence = float(max(probs[0]))
-        # 4. Feature importance
-        importance, hum_prob_grad = model.get_feature_importance(X_tensor)
-        # shape: [1, 256]
-        kmer_importances = importance[0].cpu().numpy()
-        # We’ll store them as a dictionary: index -> (k-mer, importance)
-        # Build up a dict for k-mer strings
-        kmers_list = [''.join(p) for p in product("ACGT", repeat=k)]
-        kmer_dict = {km: i for i, km in enumerate(kmers_list)}
-        # 5. Get the top 10 k-mers by absolute importance
-        abs_importance = np.abs(kmer_importances)
-        top_k = 10
-        top_idxs = np.argsort(abs_importance)[-top_k:][::-1]  # descending
-        important_kmers = []
-        for idx in top_idxs:
-            # Find the k-mer by index
-            kmer_str = kmers_list[idx]
-            # direction
-            direction = "human" if kmer_importances[idx] > 0 else "non-human"
-            # frequency in % from raw_freq_vector
-            freq_percent = float(raw_freq_vector[idx] * 100)
-            # sigma from scaled vector
-            sigma_val = float(scaled_vector[0][idx])
-            important_kmers.append({
-                'kmer': kmer_str,
-                'idx': idx,
-                'impact': float(abs_importance[idx]),
-                'direction': direction,
-                'occurrence': freq_percent,
-                'sigma': sigma_val
-            })
-        # 6. Text Summary
-        summary_text = (
-            f"**Sequence Header**: {header}\n\n"
-            f"**Predicted Label**: {pred_label}\n"
-            f"**Confidence**: {confidence:.4f}\n\n"
-            f"**Human Probability**: {human_prob:.4f}\n"
-            f"**Non-human Probability**: {non_human_prob:.4f}\n\n"
-            "### Most Influential k-mers:\n"
-        )
-        for km in important_kmers:
-            direction_text = f"(pushes toward {km['direction']})"
-            freq_text = f"{km['occurrence']:.2f}%"
-            sigma_text = f"{abs(km['sigma']):.2f}σ " + ("above" if km['sigma']>0 else "below") + " mean"
-            summary_text += (
-                f"- **{km['kmer']}**: impact={km['impact']:.4f}, {direction_text}, "
-                f"occurrence={freq_text}, ({sigma_text})\n"
-            )
-        # 7. Plots
-        #   a) SHAP-like Waterfall Plot
-        fig_waterfall = create_shap_waterfall_plot(
-            important_kmers,
-            kmer_importances,
-            human_prob,
-            f"{header}"
-        )
-        buf1 = io.BytesIO()
-        fig_waterfall.savefig(buf1, format='png', bbox_inches='tight', dpi=120)
-        buf1.seek(0)
-        waterfall_img = Image.open(buf1)
-        plt.close(fig_waterfall)
-        #   b) Frequency & σ Plot (top 10 k-mers)
-        fig_freq_sigma = create_frequency_sigma_plot(
-            important_kmers,
-            f"{header}"
-        )
-        buf2 = io.BytesIO()
-        fig_freq_sigma.savefig(buf2, format='png', bbox_inches='tight', dpi=120)
-        buf2.seek(0)
-        freq_sigma_img = Image.open(buf2)
-        plt.close(fig_freq_sigma)
-        #   c) Absolute Importance Bar Plot
-        fig_imp = create_importance_bar_plot(
-            important_kmers,
-            f"{header}"
-        )
-        buf3 = io.BytesIO()
-        fig_imp.savefig(buf3, format='png', bbox_inches='tight', dpi=120)
-        buf3.seek(0)
-        importance_img = Image.open(buf3)
-        plt.close(fig_imp)
-        return summary_text, waterfall_img, freq_sigma_img, importance_img
-    except Exception as e:
-        return (
-            f"Error during prediction or visualization: {str(e)}",
-            None,
-            None,
-            None
         )
 ###############################################################################
 # Gradio Interface
 ###############################################################################
-with gr.Blocks(title="Advanced Virus Host Classifier") as demo:
     gr.Markdown(
         """
-        # Advanced Virus Host Classifier
-        **Upload a FASTA file** containing a single nucleotide sequence.
-        The model will predict whether this sequence is **human** or **non-human**,
-        provide a confidence score, and highlight the most influential k-mers
-        (using a SHAP-like waterfall plot) along with two additional plots.
         """
     )
-    with gr.Row():
-        file_in = gr.File(label="Upload FASTA", type="binary")
-        btn = gr.Button("Run Prediction")
-    # We will create multiple tabs for our outputs
     with gr.Tabs():
-        with gr.Tab("Prediction Results"):
             md_out = gr.Markdown()
-        with gr.Tab("SHAP-like Waterfall Plot"):
-            water_out = gr.Image()
-        with gr.Tab("Frequency & σ Plot"):
-            freq_out = gr.Image()
-        with gr.Tab("Importance Bar Plot"):
-            imp_out = gr.Image()
-    # Link the button
-    btn.click(
-        fn=predict,
-        inputs=[file_in],
-        outputs=[md_out, water_out, freq_out, imp_out]
     )
 if __name__ == "__main__":

 import matplotlib.pyplot as plt
 import io
 from PIL import Image
+import shap  # Requires: pip install shap
 ###############################################################################
 # Model Definition
     def forward(self, x):
+        """Return the output of self.network applied to x."""
         return self.network(x)
 ###############################################################################
 # Utility Functions
 ###############################################################################
 def parse_fasta(text):
+    """
+    Parses text input in FASTA format into a list of (header, sequence).
+    Handles multiple sequences if present.
+    """
     sequences = []
     current_header = None
     current_sequence = []
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
+    """
+    Convert a single nucleotide sequence to a k-mer frequency vector
+    of length 4^k (e.g., for k=4, length=256).
+    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
 ###############################################################################
+# Visualization Helpers
 ###############################################################################
+def create_freq_sigma_plot(
+    single_shap_values: np.ndarray,
+    raw_freq_vector: np.ndarray,
+    scaled_vector: np.ndarray,
+    kmer_list,
+    title: str
+):
     """
+    Plot frequency (%) and sigma-from-mean for the k-mers with the largest |SHAP|.
+
+    Args:
+        single_shap_values: 1-D SHAP values for one sample, one entry per k-mer.
+        raw_freq_vector: unscaled k-mer frequencies for the same sample
+            (multiplied by 100 for display).
+        scaled_vector: scaled values for the same sample, drawn on the twin
+            axis as "σ from Mean".
+        kmer_list: k-mer labels, indexed by the same positions as the vectors.
+        title: text placed on the second line of the plot title.
+
+    Returns:
+        The matplotlib figure. It is not closed here; the caller must close it.
+    """
+    top_k = 10
+    abs_vals = np.abs(single_shap_values)
+    # argsort is ascending: take the tail and reverse it, which already yields
+    # the indices ordered by descending |SHAP| (no further sort needed).
+    top_indices = np.argsort(abs_vals)[-top_k:][::-1]
+    kmers = [kmer_list[i] for i in top_indices]
+    freqs = [raw_freq_vector[i] * 100.0 for i in top_indices]  # percentage
+    sigmas = [scaled_vector[i] for i in top_indices]
+    # color by SHAP sign (positive=green, negative=red)
+    colors = ["green" if single_shap_values[i] >= 0 else "red" for i in top_indices]
     x = np.arange(len(kmers))
     width = 0.4
+    fig, ax = plt.subplots(figsize=(8, 5))
+    # Frequency bars on the primary axis
+    ax.bar(x - width/2, freqs, width, color=colors, alpha=0.7, label="Frequency (%)")
+    ax.set_ylabel("Frequency (%)", color='black')
+    # Use a unit range when there is nothing to plot or every selected frequency
+    # is zero; otherwise the axis would be given identical limits (0, 0).
+    peak = max(freqs) if len(freqs) else 0
+    ax.set_ylim(0, peak * 1.2 if peak > 0 else 1)
+    # Sigma bars on a twin axis so each quantity keeps its own scale
+    ax2 = ax.twinx()
+    ax2.bar(x + width/2, sigmas, width, color="gray", alpha=0.5, label="σ from Mean")
+    ax2.set_ylabel("Standard Deviations (σ)", color='black')
     ax.set_xticks(x)
     ax.set_xticklabels(kmers, rotation=45, ha='right')
+    ax.set_title(f"Top-10 K-mers (Frequency & σ)\n{title}")
+    # One legend covering the bars of both axes
+    lines1, labels1 = ax.get_legend_handles_labels()
+    lines2, labels2 = ax2.get_legend_handles_labels()
+    ax.legend(lines1 + lines2, labels1 + labels2, loc='upper right')
     plt.tight_layout()
     return fig
 ###############################################################################
+# Main Inference & SHAP Logic
 ###############################################################################
+def run_classification_and_shap(file_obj):
     """
+    Classify every FASTA record in the input and compute SHAP values.
+
+    Args:
+        file_obj: FASTA content, either a str or an object whose
+            ``decode("utf-8")`` yields the text (e.g. uploaded bytes).
+
+    Returns:
+        Always a 5-tuple ``(results_table, shap_values, scaled_data,
+        kmer_list, err)``. On success ``err`` is None and:
+          - results_table: one dict per sequence (header, truncated sequence,
+            predicted label, both class probabilities, confidence)
+          - shap_values: result of calling the SHAP explainer on the batch
+          - scaled_data: scaler output, one row per sequence
+          - kmer_list: all k-mers of length k, in product("ACGT") order
+        On failure the first four entries are None and ``err`` is a message.
     """
+    def _failure(message):
+        # Same arity as the success tuple, so the caller can always unpack 5 values.
+        return None, None, None, None, message
+    # 1. Basic read
+    if isinstance(file_obj, str):
+        text = file_obj
+    else:
+        try:
+            text = file_obj.decode("utf-8")
+        except Exception as e:
+            return _failure(f"Error reading file: {str(e)}")
+    # 2. Parse FASTA
     sequences = parse_fasta(text)
     if len(sequences) == 0:
+        return _failure("No valid FASTA sequences found!")
+    # 3. Convert each sequence to k-mer vector
     k = 4
+    headers = [hdr for hdr, _ in sequences]
+    seqs = [seq for _, seq in sequences]
+    all_raw_vectors = np.stack(
+        [sequence_to_kmer_vector(seq, k=k) for seq in seqs], axis=0
+    )  # one row per sequence
+    # 4. Load model & scaler
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = VirusClassifier(input_shape=4**k).to(device)
+        state_dict = torch.load("model.pt", map_location=device)
         model.load_state_dict(state_dict)
         model.eval()
+        scaler = joblib.load("scaler.pkl")
+    except Exception as e:
+        return _failure(f"Error loading model or scaler: {str(e)}")
+    try:
+        # 5. Scale data
+        scaled_data = scaler.transform(all_raw_vectors)
+        # 6. Predictions
+        X_tensor = torch.FloatTensor(scaled_data).to(device)
+        with torch.no_grad():
+            logits = model(X_tensor)
+            probs = torch.softmax(logits, dim=1).cpu().numpy()
+        preds = np.argmax(probs, axis=1)  # 0 or 1
+        results_table = [
+            {
+                "header": hdr,
+                "sequence": seq[:50] + ("..." if len(seq) > 50 else ""),  # truncated
+                "pred_label": "human" if pred == 1 else "non-human",
+                "human_prob": float(prob[1]),
+                "non_human_prob": float(prob[0]),
+                "confidence": float(max(prob)),
+            }
+            for hdr, seq, pred, prob in zip(headers, seqs, preds, probs)
+        ]
+        # 7. SHAP Explainer
+        # The background set is the batch itself, capped at 50 rows for speed.
+        background_data = scaled_data[:50]
+        def human_probability(batch):
+            # SHAP evaluates the model on NumPy arrays, whereas the nn.Module
+            # needs a tensor on `device`. Explaining only the 'human'
+            # probability (index 1) gives one SHAP value per k-mer, which is
+            # what the waterfall and frequency plots index into.
+            with torch.no_grad():
+                inputs = torch.as_tensor(batch, dtype=torch.float32, device=device)
+                return torch.softmax(model(inputs), dim=1)[:, 1].cpu().numpy()
+        explainer = shap.Explainer(human_probability, background_data)
+        shap_values = explainer(scaled_data)
+    except Exception as e:
+        # Report through the same channel as the other failures instead of
+        # letting the exception escape the Gradio callback.
+        return _failure(f"Error during prediction or SHAP computation: {str(e)}")
+    # k-mer list, in the same order used to build the feature vectors
+    kmer_list = [''.join(p) for p in product("ACGT", repeat=k)]
+    return (results_table, shap_values, scaled_data, kmer_list, None)
+###############################################################################
+# Gradio Callback Functions
+###############################################################################
+def main_predict(file_obj):
+    """
+    Gradio callback for the 'Run' button.
+
+    Returns a 5-tuple matching the click outputs: the Markdown text (results
+    table, or the error message) followed by the SHAP values, scaled data,
+    k-mer list and per-sequence results. The last four are None on error.
+    """
+    # The error message is the last element whatever the length of the
+    # returned tuple, so split it off before touching the rest.
+    *payload, err = run_classification_and_shap(file_obj)
+    if err:
+        return (err, None, None, None, None)
+    results, shap_vals = payload[0], payload[1]
+    if results is None or shap_vals is None:
+        return ("An unknown error occurred.", None, None, None, None)
+    scaled_data, kmer_list = payload[2], payload[3]
+    # Build a summary table for all sequences
+    parts = [
+        "# Classification Results\n\n",
+        "| # | Header | Pred Label | Confidence | Human Prob | Non-human Prob |\n",
+        "|---|--------|------------|------------|------------|----------------|\n",
+    ]
+    for i, row in enumerate(results):
+        # A literal '|' in a FASTA header would be read as a column separator
+        # and break the row, so escape it.
+        header = str(row['header']).replace("|", "\\|")
+        parts.append(
+            f"| {i} | {header} | {row['pred_label']} | "
+            f"{row['confidence']:.4f} | {row['human_prob']:.4f} | {row['non_human_prob']:.4f} |\n"
+        )
+    parts.append("\nSelect a sequence index below to view SHAP Waterfall & Frequency plots.")
+    md = "".join(parts)
+    # Everything after the Markdown string is stored in gr.State components
+    # for the plot callbacks.
+    return (md, shap_vals, scaled_data, kmer_list, results)
+def update_waterfall_plot(selected_index, shap_values_obj):
+    """
+    Render the SHAP waterfall plot for one sample as a PIL image.
+
+    Args:
+        selected_index: sample index from the UI; a value that cannot be
+            converted to int, or that is out of range, falls back to 0.
+        shap_values_obj: SHAP result for the whole batch (indexable per
+            sample, with a ``.values`` array), or None if nothing has been
+            computed yet.
+
+    Returns:
+        A PIL image of the plot, or None when shap_values_obj is None.
+    """
+    if shap_values_obj is None:
+        return None
+    try:
+        selected_index = int(selected_index)
+    except (TypeError, ValueError, OverflowError):
+        selected_index = 0
+    # Same fallback as the frequency chart: an out-of-range index shows sample 0
+    # rather than raising (or silently wrapping around for negative values).
+    if not 0 <= selected_index < len(shap_values_obj.values):
+        selected_index = 0
+    shap_plots_fig = plt.figure(figsize=(8, 5))
+    try:
+        # show=False keeps shap from displaying the figure so it can be saved below
+        shap.plots.waterfall(shap_values_obj[selected_index], max_display=14,
+                             show=False)
+        buf = io.BytesIO()
+        plt.savefig(buf, format='png', bbox_inches='tight', dpi=120)
+    finally:
+        # Close the figure even when plotting fails, so figures do not pile up.
+        plt.close(shap_plots_fig)
+    buf.seek(0)
+    return Image.open(buf)
+def update_beeswarm_plot(shap_values_obj):
+    """
+    Render the SHAP beeswarm plot across all samples as a PIL image.
+
+    Returns None when shap_values_obj is None.
+    """
+    if shap_values_obj is None:
+        return None
+    beeswarm_fig = plt.figure(figsize=(8, 5))
+    try:
+        # show=False keeps shap from displaying the figure so it can be saved below
+        shap.plots.beeswarm(shap_values_obj, show=False)
+        buf = io.BytesIO()
+        plt.savefig(buf, format='png', bbox_inches='tight', dpi=120)
+    finally:
+        # Close the figure even when plotting fails, so figures do not pile up.
+        plt.close(beeswarm_fig)
+    buf.seek(0)
+    return Image.open(buf)
+def update_freq_plot(selected_index, shap_values_obj, scaled_data, kmer_list, file_obj):
+    """
+    Render the frequency & sigma chart for one sequence's top-10 k-mers.
+
+    Args:
+        selected_index: sample index from the UI; a value that cannot be
+            converted to int, or that is out of range, falls back to 0.
+        shap_values_obj: SHAP result for the batch; ``.values`` is indexed
+            per sample.
+        scaled_data: scaled feature rows, indexed per sample.
+        kmer_list: k-mer labels for the feature positions.
+        file_obj: the original FASTA input (str, or bytes decoded as UTF-8),
+            re-parsed here to recover the unscaled frequencies.
+
+    Returns:
+        A PIL image of the chart, or None when any input is missing or the
+        input contains no sequences.
+    """
+    if shap_values_obj is None or scaled_data is None or kmer_list is None:
+        return None
+    # The stored upload is empty until a classification has been run.
+    if file_obj is None:
+        return None
+    try:
+        selected_index = int(selected_index)
+    except (TypeError, ValueError, OverflowError):
+        selected_index = 0
+    # run_classification_and_shap does not return the raw frequency vectors,
+    # so they are recomputed from the original input.
+    if isinstance(file_obj, str):
+        text = file_obj
+    else:
+        text = file_obj.decode('utf-8')
+    sequences = parse_fasta(text)
+    if not sequences:
+        return None
+    # Out-of-range (including negative) indices fall back to the first sequence.
+    if not 0 <= selected_index < len(sequences):
+        selected_index = 0
+    header, seq = sequences[selected_index]
+    # k must match the value used in run_classification_and_shap.
+    raw_vec = sequence_to_kmer_vector(seq, k=4)
+    single_shap_values = shap_values_obj.values[selected_index]
+    freq_sigma_fig = create_freq_sigma_plot(
+        single_shap_values,
+        raw_freq_vector=raw_vec,
+        scaled_vector=scaled_data[selected_index],
+        kmer_list=kmer_list,
+        title=f"Sample #{selected_index} — {header}"
+    )
+    try:
+        buf = io.BytesIO()
+        freq_sigma_fig.savefig(buf, format='png', bbox_inches='tight', dpi=120)
+    finally:
+        # Close the figure even when saving fails, so figures do not pile up.
+        plt.close(freq_sigma_fig)
+    buf.seek(0)
+    return Image.open(buf)
 ###############################################################################
 # Gradio Interface
 ###############################################################################
+with gr.Blocks(title="Multi-Sequence Virus Host Classifier with SHAP") as demo:
+    shap.initjs()  # load shap JS for interactive plots in some contexts (optional)
     gr.Markdown(
         """
+        # **Advanced Virus Host Classifier with SHAP**
+        **Upload a FASTA file** with one or more nucleotide sequences.
+        This app will:
+        1. Predict each sequence's **host** (human vs. non-human).
+        2. Provide **SHAP** explanations (waterfall & beeswarm).
+        3. Let you explore **frequency & σ** for top-10 k-mers for a chosen sequence.
         """
     )
+    with gr.Row():
+        file_input = gr.File(label="Upload FASTA", type="binary")
+        run_btn = gr.Button("Run Classification")
+    # Values produced by a run and consumed later by the plot callbacks
+    shap_values_state = gr.State()
+    scaled_data_state = gr.State()
+    kmer_list_state = gr.State()
+    results_state = gr.State()
+    # The raw upload, kept so the frequency chart can re-parse it
+    file_data_state = gr.State()
+    # TABS for outputs
     with gr.Tabs():
+        with gr.Tab("Results Table"):
             md_out = gr.Markdown()
+        with gr.Tab("SHAP Waterfall"):
+            # The user picks which sequence to explain by its 0-based index
+            with gr.Row():
+                seq_index_dropdown = gr.Number(label="Sequence Index (0-based)", value=0, precision=0)
+                update_wf_btn = gr.Button("Update Waterfall")
+            wf_plot = gr.Image(label="SHAP Waterfall Plot")
+        with gr.Tab("SHAP Beeswarm"):
+            bs_plot = gr.Image(label="Global Beeswarm Plot", height=500)
+        with gr.Tab("Top-10 Frequency & Sigma"):
+            with gr.Row():
+                seq_index_dropdown2 = gr.Number(label="Sequence Index (0-based)", value=0, precision=0)
+                update_fs_btn = gr.Button("Update Frequency Chart")
+            fs_plot = gr.Image(label="Top-10 Frequency & σ Chart")
+    # --- Button Logic ---
+    run_btn.click(  # Store the raw file data for later freq plots
+        fn=lambda x: x,
+        inputs=file_input,
+        outputs=file_data_state
+    )
+    update_wf_btn.click(
+        fn=update_waterfall_plot,
+        inputs=[seq_index_dropdown, shap_values_state],
+        outputs=[wf_plot]
+    )
+    update_fs_btn.click(
+        fn=update_freq_plot,
+        inputs=[seq_index_dropdown2, shap_values_state, scaled_data_state, kmer_list_state, file_data_state],
+        outputs=[fs_plot]
+    )
+    # The beeswarm plot reads shap_values_state, which main_predict writes.
+    # Independent click listeners are not guaranteed to run in that order, so
+    # the plot is chained to run only after main_predict has finished.
+    run_btn.click(
+        fn=main_predict,
+        inputs=[file_input],
+        outputs=[md_out, shap_values_state, scaled_data_state, kmer_list_state, results_state]
+    ).then(
+        fn=update_beeswarm_plot,
+        inputs=[shap_values_state],
+        outputs=[bs_plot]
     )
 if __name__ == "__main__":