Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Jan 12

Commit

910c6c2

verified ·

1 Parent(s): 03f2bb5

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -31

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import numpy as np
 from itertools import product
 import torch.nn as nn
 import matplotlib.pyplot as plt
 import io
 from PIL import Image
@@ -144,20 +145,17 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     """
     n = len(shap_means)
     if n == 0:
-        # Edge case: empty array
         return (0, 0, 0.0)
     if window_size >= n:
-        # If the window is bigger than the entire sequence, return entire seq
         avg_val = float(np.mean(shap_means))
         return (0, n, avg_val)
-    # We'll build csum as length n+1 so csum[i] = sum of shap_means[:i]
-    # That means sum in [start, start+window_size) = csum[start+window_size] - csum[start].
     csum = np.zeros(n + 1, dtype=np.float32)
     csum[1:] = np.cumsum(shap_means)
     best_start = 0
-    # Initialize with the first window: [0, window_size)
     best_sum = csum[window_size] - csum[0]
     best_avg = best_sum / window_size
@@ -188,29 +186,65 @@ def fig_to_image(fig):
     plt.close(fig)
     return img
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
     """
-    Plots a 1D heatmap of per-base SHAP contributions.
-    Negative = push toward Non-Human, Positive = push toward Human.
-    Optionally can show only a subrange (start:end).
-    Adjust layout so the colorbar is well below the x-axis:
-      - orientation='horizontal', pad=0.35
-      - plt.subplots_adjust(bottom=0.4)
     """
     if start is not None and end is not None:
-        shap_means = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
     else:
         subtitle = ""
-    heatmap_data = shap_means.reshape(1, -1)  # shape (1, region_length)
     fig, ax = plt.subplots(figsize=(12, 2))
-    cax = ax.imshow(heatmap_data, aspect='auto', cmap='RdBu_r')
-    # Place colorbar below and add extra margin
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.35)
-    cbar.set_label('SHAP Contribution')
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence')
@@ -231,7 +265,8 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
-    colors = ['#ff9999' if v > 0 else '#99ccff' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
@@ -244,7 +279,6 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     """
     Simple histogram of SHAP values in the subregion.
-    Helps see how many positions push human vs non-human.
     """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
@@ -294,12 +328,11 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     # Load model and scaler
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
-        # Use weights_only=True to address the FutureWarning about untrusted pickle data
         state_dict = torch.load('model.pt', map_location=device, weights_only=True)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
-        # Load scaler (warning if version mismatch)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
         return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
@@ -353,7 +386,6 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
         "shap_means": shap_means
     }
-    # Return exactly 5 items
     return (results_text, bar_img, heatmap_img, state_dict_out, header)
 ###############################################################################
@@ -438,9 +470,11 @@ css = """
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
-    # Virus Host Classifier (with Interactive Region Viewer)
     **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.
     **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
@@ -477,12 +511,12 @@ with gr.Blocks(css=css) as iface:
                     label="Classification Results", lines=12, interactive=False
                 )
                 kmer_img = gr.Image(label="Top k-mer SHAP")
-                genome_img = gr.Image(label="Genome-wide SHAP Heatmap")
         seq_state = gr.State()
         header_state = gr.State()
-        # analyze_sequence(...) returns 5 items.
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
@@ -492,7 +526,8 @@ with gr.Blocks(css=css) as iface:
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
         **Subregion Analysis**
-        Select start/end positions to view local SHAP signals, distribution, and GC content.
         """)
         with gr.Row():
             region_start = gr.Number(label="Region Start", value=0)
@@ -505,7 +540,7 @@ with gr.Blocks(css=css) as iface:
             interactive=False
         )
         with gr.Row():
-            subregion_img = gr.Image(label="Subregion SHAP Heatmap")
             subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         region_btn.click(
@@ -517,10 +552,15 @@ with gr.Blocks(css=css) as iface:
     gr.Markdown("""
     ### Interface Features
     - **Overall Classification** (human vs non-human) using k-mer frequencies.
-    - **Top k-mer SHAP**: which k-mers push the classifier output.
-    - **Genome-Wide SHAP Heatmap**: each base's average SHAP across overlapping k-mers.
-    - **Identify Subregions** (sliding window) with the strongest push for human or non-human.
-    - **Subregion Exploration**: local SHAP heatmap & histogram, GC content, fraction of positions pushing human vs. non-human.
     """)
 if __name__ == "__main__":

 from itertools import product
 import torch.nn as nn
 import matplotlib.pyplot as plt
+import matplotlib.colors as mcolors
 import io
 from PIL import Image
     """
     n = len(shap_means)
     if n == 0:
         return (0, 0, 0.0)
     if window_size >= n:
+        # entire sequence
         avg_val = float(np.mean(shap_means))
         return (0, n, avg_val)
+    # We'll build csum of length n+1
     csum = np.zeros(n + 1, dtype=np.float32)
     csum[1:] = np.cumsum(shap_means)
     best_start = 0
     best_sum = csum[window_size] - csum[0]
     best_avg = best_sum / window_size
     plt.close(fig)
     return img
+def get_zero_centered_cmap():
+    """
+    Creates a custom diverging colormap that is:
+    - Blue for negative
+    - White for zero
+    - Red for positive
+    """
+    colors = [
+        (0.0, 'blue'),   # negative
+        (0.5, 'white'),  # zero
+        (1.0, 'red')     # positive
+    ]
+    cmap = mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
+    return cmap
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
     """
+    Plots a 1D heatmap of per-base SHAP contributions with a custom colormap:
+    - Negative = blue
+    - 0 = white
+    - Positive = red
+    We'll force the range to be symmetrical around 0 by using:
+      vmin=-extent, vmax=+extent
+    so 0 is in the middle.
     """
     if start is not None and end is not None:
+        local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
     else:
+        local_shap = shap_means
         subtitle = ""
+    if len(local_shap) == 0:
+        # Edge case: no data to plot
+        local_shap = np.array([0.0])
+    # Build 2D array for imshow
+    heatmap_data = local_shap.reshape(1, -1)
+    # Force symmetrical range
+    min_val = np.min(local_shap)
+    max_val = np.max(local_shap)
+    extent = max(abs(min_val), abs(max_val))
+    # Create custom colormap
+    custom_cmap = get_zero_centered_cmap()
     fig, ax = plt.subplots(figsize=(12, 2))
+    cax = ax.imshow(
+        heatmap_data,
+        aspect='auto',
+        cmap=custom_cmap,
+        vmin=-extent,
+        vmax=+extent
+    )
+    # Place colorbar below with plenty of margin
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.35)
+    cbar.set_label('SHAP Contribution (negative=blue, zero=white, positive=red)')
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence')
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
+    # negative -> blue, positive -> red
+    colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     """
     Simple histogram of SHAP values in the subregion.
     """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
     # Load model and scaler
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
+        # Use weights_only=True for safer loading
         state_dict = torch.load('model.pt', map_location=device, weights_only=True)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
         return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
         "shap_means": shap_means
     }
     return (results_text, bar_img, heatmap_img, state_dict_out, header)
 ###############################################################################
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
+    # Virus Host Classifier with White-Centered Gradient
     **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.
     **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
+    **Color Scale**: Negative SHAP = Blue, Zero = White, Positive = Red.
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
                     label="Classification Results", lines=12, interactive=False
                 )
                 kmer_img = gr.Image(label="Top k-mer SHAP")
+                genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")
         seq_state = gr.State()
         header_state = gr.State()
+        # analyze_sequence(...) returns 5 items
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
         **Subregion Analysis**
+        Select start/end positions to view local SHAP signals, distribution, and GC content.
+        The heatmap also uses the same Blue-White-Red scale.
         """)
         with gr.Row():
             region_start = gr.Number(label="Region Start", value=0)
             interactive=False
         )
         with gr.Row():
+            subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
             subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         region_btn.click(
     gr.Markdown("""
     ### Interface Features
     - **Overall Classification** (human vs non-human) using k-mer frequencies.
+    - **SHAP Analysis** to see which k-mers push classification toward or away from human.
+    - **White-Centered SHAP Gradient**:
+      - Negative (blue), 0 (white), Positive (red), with symmetrical color range around 0.
+    - **Identify Subregions** with the strongest push for human or non-human.
+    - **Subregion Exploration**:
+      - Local SHAP heatmap & histogram
+      - GC content
+      - Fraction of positions pushing human vs. non-human
+      - Simple logic-based classification
     """)
 if __name__ == "__main__":