Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12

Commit

d76e76a

verified ·

1 Parent(s): 962ae70

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -125

app.py CHANGED Viewed

@@ -96,20 +96,19 @@ def calculate_shap_values(model, x_tensor):
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
-            orig_value = x_zeroed[0, i].item()
             x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
             prob = probs[0, 1].item()
-            impact = baseline_prob - prob  # how much removing the feature changed the prediction
             shap_values.append(impact)
-            x_zeroed[0, i] = orig_value  # restore the original value
     return np.array(shap_values), baseline_prob
 ###############################################################################
-# 4. PER-BASE SHAP AGGREGATION (LINEAR HEATMAP)
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
@@ -122,60 +121,98 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     seq_len = len(sequence)
-    # Arrays to accumulate sums (SHAP) and coverage counts
     shap_sums = np.zeros(seq_len, dtype=np.float32)
     coverage = np.zeros(seq_len, dtype=np.float32)
-    # Slide over the sequence, summing SHAP values for overlapping positions
     for i in range(seq_len - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
-            # Get the SHAP value for this k-mer
-            value = shap_values[kmer_dict[kmer]]
-            # Accumulate it for each base in the k-mer
-            shap_sums[i : i + k] += value
             coverage[i : i + k] += 1
-    # Compute the average SHAP per base (avoid divide-by-zero)
     with np.errstate(divide='ignore', invalid='ignore'):
         shap_means = np.where(coverage > 0, shap_sums / coverage, 0.0)
     return shap_means
-def plot_linear_heatmap(shap_means):
     """
     Plots a 1D heatmap of per-base SHAP contributions.
     Negative = push toward Non-Human, Positive = push toward Human.
     """
-    # Reshape into (1, -1) so that imshow displays it as a single row
-    heatmap_data = shap_means.reshape(1, -1)
     fig, ax = plt.subplots(figsize=(12, 2))
-    # We'll use a diverging color map (red/blue)
     cax = ax.imshow(heatmap_data, aspect='auto', cmap='RdBu_r')
-    # Add colorbar
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.2)
     cbar.set_label('SHAP Contribution')
-    ax.set_yticks([])  # single row, so hide the y-axis
-    ax.set_xlabel('Position in Sequence')
-    ax.set_title('Per-base SHAP Heatmap')
     plt.tight_layout()
     return fig
 ###############################################################################
-# 5. OTHER PLOTS: BAR PLOT OF TOP-K AND SEQUENCE IMPACT VISUALIZATION
 ###############################################################################
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
-    plt.figure(figsize=(10, 6))
     # Sort by absolute importance
     indices = np.argsort(np.abs(shap_values))[-top_k:]
@@ -188,83 +225,16 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
-    plt.gca().invert_yaxis()  # most important at top
-    return plt.gcf()
-def visualize_sequence_impacts(sequence, kmers, shap_values, base_prob):
-    """
-    Create a SHAP-style visualization of sequence impacts.
-    Shows each k-mer's contribution in context.
-    """
-    k = 4  # k-mer size
-    kmer_dict = {km: i for i, km in enumerate(kmers)}
-    # Find all k-mers and their impacts
-    kmer_impacts = []
-    for i in range(len(sequence) - k + 1):
-        kmer = sequence[i:i+k]
-        if kmer in kmer_dict:
-            impact = shap_values[kmer_dict[kmer]]
-            kmer_impacts.append((i, kmer, impact))
-    # Sort by absolute impact
-    kmer_impacts.sort(key=lambda x: abs(x[2]), reverse=True)
-    # Limit display to top 30 k-mers
-    display_kmers = kmer_impacts[:30]
-    # Calculate figure height based on number of k-mers
-    fig_height = min(20, max(8, len(display_kmers) * 0.4))
-    # Create figure with controlled size
-    fig = plt.figure(figsize=(12, fig_height))
-    ax = plt.gca()
-    # Add title and base value
-    plt.text(0.01, 1.02, f"base value = {base_prob:.3f}", transform=ax.transAxes, fontsize=10)
-    # Plot k-mers with controlled spacing
-    y_spacing = 0.9 / max(len(display_kmers), 1)
-    y_position = 0.95
-    for pos, kmer, impact in display_kmers:
-        pre_sequence = sequence[max(0, pos-20):pos]
-        post_sequence = sequence[pos+len(kmer):min(pos+len(kmer)+20, len(sequence))]
-        # Add ellipsis if truncated
-        pre_ellipsis = "..." if pos > 20 else ""
-        post_ellipsis = "..." if pos+len(kmer)+20 < len(sequence) else ""
-        # Choose color based on impact
-        color = '#ffcccb' if impact > 0 else '#cce0ff'
-        arrow = '↑' if impact > 0 else '↓'
-        # Draw text elements
-        plt.text(0.01, y_position, f"{pre_ellipsis}{pre_sequence}", fontsize=9)
-        plt.text(0.01 + len(f"{pre_ellipsis}{pre_sequence}")/50, y_position,
-                 kmer, fontsize=9, bbox=dict(facecolor=color, alpha=0.3, pad=1))
-        plt.text(0.01 + (len(f"{pre_ellipsis}{pre_sequence}") + len(kmer))/50,
-                 y_position, f"{post_sequence}{post_ellipsis}", fontsize=9)
-        # Add impact value
-        plt.text(0.8, y_position, f"{arrow} {impact:+.3f}", fontsize=9)
-        y_position -= y_spacing
-    plt.axis('off')
-    # Adjust layout
-    plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
     return fig
 ###############################################################################
-# 6. HELPER FUNCTION: FIG TO IMAGE
 ###############################################################################
 def fig_to_image(fig):
     """Convert a Matplotlib figure to a PIL Image."""
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
@@ -272,12 +242,11 @@ def fig_to_image(fig):
     plt.close(fig)
     return img
 ###############################################################################
-# 7. MAIN PREDICTION FUNCTION
 ###############################################################################
-def predict(file_obj, top_kmers=10, fasta_text=""):
     """Main prediction function for Gradio interface."""
     # Handle input
     if fasta_text.strip():
@@ -302,7 +271,6 @@ def predict(file_obj, top_kmers=10, fasta_text=""):
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         model = VirusClassifier(256).to(device)
-        # Remove 'weights_only=True' if it causes errors; it's not a standard argument.
         model.load_state_dict(torch.load('model.pt', map_location=device))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
@@ -321,31 +289,34 @@ def predict(file_obj, top_kmers=10, fasta_text=""):
         f"Sequence: {header}",
         f"Prediction: {'Human' if prob_human > 0.5 else 'Non-human'} Origin",
         f"Confidence: {max(prob_human, 1 - prob_human):.3f}",
-        f"Human Probability: {prob_human:.3f}",
-        "\nTop Contributing k-mers:"
     ]
-    # Create k-mer lists for visualization
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
-    # 1) K-mer importance bar plot
-    importance_plot = create_importance_bar_plot(shap_values, kmers, top_kmers)
-    importance_img = fig_to_image(importance_plot)
-    # 2) SHAP-style textual sequence impact
-    sequence_plot = visualize_sequence_impacts(seq, kmers, shap_values, prob_human)
-    sequence_img = fig_to_image(sequence_plot)
-    # 3) Linear heatmap across full genome
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
-    heatmap_fig = plot_linear_heatmap(shap_means)
     heatmap_img = fig_to_image(heatmap_fig)
-    return "\n".join(results), importance_img, sequence_img, heatmap_img
 ###############################################################################
-# 8. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
@@ -379,31 +350,34 @@ with gr.Blocks(css=css) as iface:
                 step=1,
                 label="Number of top k-mers to display"
             )
             submit_btn = gr.Button("Analyze Sequence", variant="primary")
         with gr.Column(scale=2):
-            results = gr.Textbox(label="Analysis Results", lines=10)
-            kmer_plot = gr.Image(label="K-mer Importance Plot")
-            shap_plot = gr.Image(label="Sequence Impact Visualization (SHAP-style)")
-            heatmap_plot = gr.Image(label="Genome Heatmap")
     submit_btn.click(
         predict,
-        inputs=[file_input, top_k, text_input],
-        outputs=[results, kmer_plot, shap_plot, heatmap_plot]
     )
     gr.Markdown("""
     ### Visualization Guide
-    - **K-mer Importance Plot**: Shows the most influential k-mers and their SHAP values
-    - **Sequence Impact Visualization**: Shows the sequence with highlighted k-mers:
-      - Red highlights = pushing toward human origin
-      - Blue highlights = pushing toward non-human origin
-      - Arrows (↑/↓) show impact direction
-      - Values show impact magnitude
-    - **Genome Heatmap**: Per-base SHAP values across the entire sequence
       - Red = push toward human
       - Blue = push toward non-human
     """)
 if __name__ == "__main__":

         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
+            original_value = x_zeroed[0, i].item()
             x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
             prob = probs[0, 1].item()
+            impact = baseline_prob - prob
             shap_values.append(impact)
+            x_zeroed[0, i] = original_value  # restore
     return np.array(shap_values), baseline_prob
 ###############################################################################
+# 4. PER-BASE SHAP AGGREGATION
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     seq_len = len(sequence)
     shap_sums = np.zeros(seq_len, dtype=np.float32)
     coverage = np.zeros(seq_len, dtype=np.float32)
     for i in range(seq_len - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
+            val = shap_values[kmer_dict[kmer]]
+            shap_sums[i : i + k] += val
             coverage[i : i + k] += 1
     with np.errstate(divide='ignore', invalid='ignore'):
         shap_means = np.where(coverage > 0, shap_sums / coverage, 0.0)
     return shap_means
+###############################################################################
+# 5. HEATMAP PLOTS
+###############################################################################
def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap"):
    """
    Plot a 1D heatmap of per-base SHAP contributions.

    Negative = push toward Non-Human, Positive = push toward Human.

    Args:
        shap_means: 1D array-like of per-base SHAP values.
        title: Title drawn above the heatmap.

    Returns:
        The Matplotlib figure containing the heatmap and its colorbar.
    """
    # imshow needs a 2D array; a single row renders as one horizontal strip.
    heatmap_data = np.asarray(shap_means).reshape(1, -1)

    # Centre the diverging colormap on zero so that colour encodes sign:
    # with auto-scaling, a mostly-positive (or mostly-negative) track would
    # shift the neutral colour away from 0 and mislabel the direction.
    limit = float(np.max(np.abs(heatmap_data))) if heatmap_data.size else 0.0
    if not np.isfinite(limit) or limit == 0.0:
        limit = 1.0  # avoid a degenerate vmin == vmax colour range

    fig, ax = plt.subplots(figsize=(12, 2))
    image = ax.imshow(
        heatmap_data,
        aspect='auto',
        cmap='RdBu_r',
        vmin=-limit,
        vmax=limit,
    )

    # Use the figure's own methods rather than pyplot's "current figure"
    # state, so the colorbar always attaches to this figure.
    cbar = fig.colorbar(image, ax=ax, orientation='horizontal', pad=0.2)
    cbar.set_label('SHAP Contribution')

    ax.set_yticks([])  # single row, so the y-axis carries no information
    ax.set_xlabel('Position in Sequence')
    ax.set_title(title)
    fig.tight_layout()
    return fig
def get_top_signal_region(shap_means, window_size=500):
    """
    Find the window of length `window_size` that has the highest
    sum of absolute SHAP values.

    Args:
        shap_means: 1D array-like of per-base SHAP values.
        window_size: Length of the window, in bases. Coerced with `int()`,
            so integral floats are accepted.

    Returns:
        (start_index, end_index) as Python ints, end exclusive. When the
        window is at least as long as the sequence the whole sequence
        (0, len) is returned; a non-positive window yields (0, 0). On ties
        the earliest window wins.
    """
    # Accumulate in float64: a running sum kept in the input dtype drifts
    # as rounding error builds up over long sequences.
    abs_values = np.abs(np.asarray(shap_means, dtype=np.float64)).ravel()
    seq_len = int(abs_values.size)
    window_size = int(window_size)

    if window_size >= seq_len:
        return 0, seq_len  # entire sequence if window too large
    if window_size <= 0:
        return 0, 0  # nothing to select

    # prefix[i] holds the sum of the first i values, so every window sum is
    # a single subtraction: sum(abs_values[s:s + w]) == prefix[s + w] - prefix[s].
    prefix = np.concatenate(([0.0], np.cumsum(abs_values)))
    window_sums = prefix[window_size:] - prefix[:-window_size]

    # argmax returns the first maximum, matching a strict ">" scan.
    max_start = int(np.argmax(window_sums))
    return max_start, max_start + window_size
def plot_zoomed_heatmap(shap_means, window_size=500, title="Zoomed SHAP Region"):
    """
    Find the region with the largest absolute SHAP sum in a fixed window,
    then plot a 1D heatmap of just that sub-region.

    Args:
        shap_means: 1D NumPy array of per-base SHAP values.
        window_size: Length of the window passed to `get_top_signal_region`.
        title: Title drawn above the heatmap.

    Returns:
        The Matplotlib figure containing the zoomed heatmap and its colorbar.
    """
    start, end = get_top_signal_region(shap_means, window_size)
    # imshow needs a 2D array; a single row renders as one horizontal strip.
    sub_means = shap_means[start:end].reshape(1, -1)

    # Centre the diverging colormap on zero so that colour encodes sign
    # regardless of how skewed the values inside the window are.
    limit = float(np.max(np.abs(sub_means))) if sub_means.size else 0.0
    if not np.isfinite(limit) or limit == 0.0:
        limit = 1.0  # avoid a degenerate vmin == vmax colour range

    image_kwargs = dict(aspect='auto', cmap='RdBu_r', vmin=-limit, vmax=limit)
    if end > start:
        # Map pixel columns onto real sequence coordinates so the x-axis
        # ticks read as positions in the full sequence, not window offsets.
        image_kwargs['extent'] = (start, end, 0, 1)

    fig, ax = plt.subplots(figsize=(12, 2))
    image = ax.imshow(sub_means, **image_kwargs)

    # Use the figure's own methods rather than pyplot's "current figure"
    # state, so the colorbar always attaches to this figure.
    cbar = fig.colorbar(image, ax=ax, orientation='horizontal', pad=0.2)
    cbar.set_label('SHAP Contribution')

    ax.set_yticks([])  # single row, so the y-axis carries no information
    ax.set_xlabel(f'Position in Sequence (zoomed in {start} - {end})')
    ax.set_title(title)
    fig.tight_layout()
    return fig
 ###############################################################################
+# 6. OTHER PLOT: TOP-K K-MER BAR PLOT
 ###############################################################################
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
+    fig = plt.figure(figsize=(10, 5))
     # Sort by absolute importance
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
+    plt.gca().invert_yaxis()
     return fig
 ###############################################################################
+# 7. HELPER FUNCTION: FIG TO IMAGE
 ###############################################################################
 def fig_to_image(fig):
     """Convert a Matplotlib figure to a PIL Image."""
+    import io
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
     plt.close(fig)
     return img
 ###############################################################################
+# 8. MAIN PREDICTION FUNCTION
 ###############################################################################
+def predict(file_obj, top_kmers=10, fasta_text="", zoom_window=500):
     """Main prediction function for Gradio interface."""
     # Handle input
     if fasta_text.strip():
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         model = VirusClassifier(256).to(device)
         model.load_state_dict(torch.load('model.pt', map_location=device))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
         f"Sequence: {header}",
         f"Prediction: {'Human' if prob_human > 0.5 else 'Non-human'} Origin",
         f"Confidence: {max(prob_human, 1 - prob_human):.3f}",
+        f"Human Probability: {prob_human:.3f}"
     ]
+    # Create k-mer list (4-mers in lexicographic order)
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
+    # 1) Top-k k-mer bar plot
+    importance_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
+    importance_img = fig_to_image(importance_fig)
+    # 2) Full-genome per-base SHAP heatmap
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
+    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide Per-base SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
+    # 3) Zoomed region (optional, using the largest absolute SHAP region)
+    if zoom_window > 0:
+        zoom_fig = plot_zoomed_heatmap(shap_means, window_size=zoom_window,
+                                       title=f"Top SHAP Region (window={zoom_window})")
+        zoom_img = fig_to_image(zoom_fig)
+    else:
+        zoom_img = None
+    return "\n".join(results), importance_img, heatmap_img, zoom_img
 ###############################################################################
+# 9. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
                 step=1,
                 label="Number of top k-mers to display"
             )
+            zoom_window = gr.Slider(
+                minimum=0,
+                maximum=5000,
+                value=500,
+                step=100,
+                label="Zoom Window Size (0 to disable zoom plot)"
+            )
             submit_btn = gr.Button("Analyze Sequence", variant="primary")
         with gr.Column(scale=2):
+            results_box = gr.Textbox(label="Analysis Results", lines=5)
+            kmer_plot = gr.Image(label="Top k-mer SHAP")
+            full_heatmap = gr.Image(label="Genome-wide SHAP Heatmap")
+            zoomed_heatmap = gr.Image(label="Zoomed SHAP Region (largest signal)")
     submit_btn.click(
         predict,
+        inputs=[file_input, top_k, text_input, zoom_window],
+        outputs=[results_box, kmer_plot, full_heatmap, zoomed_heatmap]
     )
     gr.Markdown("""
     ### Visualization Guide
+    - **Top k-mer SHAP**: Shows the most influential k-mers and their SHAP values.
+    - **Genome-wide SHAP Heatmap**: Per-base SHAP values across the entire sequence.
       - Red = push toward human
       - Blue = push toward non-human
+    - **Zoomed SHAP Region**: Shows the subregion of length 'Zoom Window Size' that has the highest absolute SHAP sum.
     """)
 if __name__ == "__main__":