Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Jan 12

Commit

6d0235b

verified ·

1 Parent(s): 56468ea

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -28

app.py CHANGED Viewed

@@ -150,6 +150,7 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     Plots a 1D heatmap of per-base SHAP contributions.
     Negative = push toward Non-Human, Positive = push toward Human.
     Optionally can show only a subrange (start:end).
     """
     if start is not None and end is not None:
         shap_means = shap_means[start:end]
@@ -161,13 +162,18 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     fig, ax = plt.subplots(figsize=(12, 2))
     cax = ax.imshow(heatmap_data, aspect='auto', cmap='RdBu_r')
-    cbar = plt.colorbar(cax, orientation='horizontal', pad=0.2)
     cbar.set_label('SHAP Contribution')
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence')
     ax.set_title(f"{title}{subtitle}")
     plt.tight_layout()
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
@@ -187,6 +193,22 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.xlabel('SHAP Value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
     plt.gca().invert_yaxis()
     return fig
 def compute_gc_content(sequence):
@@ -281,19 +303,22 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text=""):
 def analyze_subregion(state, header, region_start, region_end):
     """
     Takes stored data from step 1 and a user-chosen region.
-    Returns a subregion heatmap and some stats (like GC content, average SHAP).
     """
     if not state or "seq" not in state or "shap_means" not in state:
-        return ("No sequence data found. Please run Step 1 first.", None)
     seq = state["seq"]
     shap_means = state["shap_means"]
     # Validate bounds
     region_start = max(0, min(region_start, len(seq)))
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
-        return ("Invalid region range. End must be > Start.", None)
     # Subsequence
     region_seq = seq[region_start:region_end]
@@ -302,23 +327,44 @@ def analyze_subregion(state, header, region_start, region_end):
     # Some stats
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
     region_info = (
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
         f"GC content: {gc_percent:.2f}%\n"
-        f"Average SHAP in region: {avg_shap:.4f} "
-        f"({'toward human' if avg_shap > 0 else 'toward non-human' if avg_shap < 0 else 'neutral'})"
     )
     # Plot region as small heatmap
-    fig = plot_linear_heatmap(shap_means,
-                              title="Subregion SHAP",
-                              start=region_start,
-                              end=region_end)
-    heatmap_img = fig_to_image(fig)
-    return (region_info, heatmap_img)
 ###############################################################################
@@ -335,7 +381,7 @@ with gr.Blocks(css=css) as iface:
     gr.Markdown("""
     # Virus Host Classifier (with Interactive Region Viewer)
     **Step 1**: Predict overall viral sequence origin (human vs non-human)
-    **Step 2**: Explore subregions to see local SHAP signals and GC content
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
@@ -368,8 +414,8 @@ with gr.Blocks(css=css) as iface:
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap")
         # Hidden states that store data for step 2
-        # "state" will hold (sequence, shap_means).
-        # "header" is optional meta info
         seq_state = gr.State()
         header_state = gr.State()
@@ -382,7 +428,8 @@ with gr.Blocks(css=css) as iface:
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
-        Select start/end positions to view local SHAP signals.
         """)
         with gr.Row():
             region_start = gr.Number(label="Region Start", value=0)
@@ -391,15 +438,17 @@ with gr.Blocks(css=css) as iface:
         subregion_info = gr.Textbox(
             label="Subregion Analysis",
-            lines=4,
             interactive=False
         )
-        subregion_img = gr.Image(label="Subregion SHAP Heatmap")
         region_btn.click(
             analyze_subregion,
             inputs=[seq_state, header_state, region_start, region_end],
-            outputs=[subregion_info, subregion_img]
         )
     gr.Markdown("""
@@ -407,13 +456,10 @@ with gr.Blocks(css=css) as iface:
     1. **Overall Classification** (human vs non-human), using a learned model on k-mer frequencies.
     2. **SHAP Analysis** (ablation-based) to see which k-mer features push classification toward or away from "human".
     3. **Genome-Wide SHAP Heatmap**: Each base's average SHAP across overlapping k-mers.
-    4. **Subregion Exploration**:
-       - View SHAP signals in a user-chosen region.
-       - Calculate local GC content, average SHAP, etc.
-    ### Tips
-    - For very large sequences (e.g., >100k bases), the full heatmap might be large; consider downsampling if needed.
-    - Adjust *Region Start* and *End* to explore different parts of the genome.
     """)
 if __name__ == "__main__":

     Plots a 1D heatmap of per-base SHAP contributions.
     Negative = push toward Non-Human, Positive = push toward Human.
     Optionally can show only a subrange (start:end).
+    We'll add extra bottom margin to avoid x-axis overlap.
     """
     if start is not None and end is not None:
         shap_means = shap_means[start:end]
     fig, ax = plt.subplots(figsize=(12, 2))
     cax = ax.imshow(heatmap_data, aspect='auto', cmap='RdBu_r')
+    cbar = plt.colorbar(cax, orientation='horizontal', pad=0.3)
     cbar.set_label('SHAP Contribution')
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence')
     ax.set_title(f"{title}{subtitle}")
+    # Extra spacing for x-axis labels
     plt.tight_layout()
+    # Or you can do something like:
+    # plt.subplots_adjust(bottom=0.2)
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.xlabel('SHAP Value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
     plt.gca().invert_yaxis()
+    plt.tight_layout()
+    return fig
+def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
+    """
+    Simple histogram of SHAP values in the subregion.
+    Helps see how many positions push human vs non-human.
+    """
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
+    ax.axvline(0, color='red', linestyle='--', label='0.0')
+    ax.set_xlabel("SHAP Value")
+    ax.set_ylabel("Count")
+    ax.set_title(title)
+    ax.legend()
+    plt.tight_layout()
     return fig
 def compute_gc_content(sequence):
 def analyze_subregion(state, header, region_start, region_end):
     """
     Takes stored data from step 1 and a user-chosen region.
+    Returns a subregion heatmap, histogram, and some stats (GC, average SHAP).
     """
     if not state or "seq" not in state or "shap_means" not in state:
+        return ("No sequence data found. Please run Step 1 first.", None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
     # Validate bounds
+    region_start = int(region_start)
+    region_end = int(region_end)
     region_start = max(0, min(region_start, len(seq)))
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
+        return ("Invalid region range. End must be > Start.", None, None)
     # Subsequence
     region_seq = seq[region_start:region_end]
     # Some stats
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
+    # Fraction pushing toward human vs. non-human
+    positive_fraction = np.mean(region_shap > 0)
+    negative_fraction = np.mean(region_shap < 0)
+    # Simple logic-based interpretation
+    # Adjust thresholds as needed
+    if avg_shap > 0.05:
+        region_classification = "Likely pushing toward human"
+    elif avg_shap < -0.05:
+        region_classification = "Likely pushing toward non-human"
+    else:
+        region_classification = "Near neutral (no strong push)"
     region_info = (
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
         f"GC content: {gc_percent:.2f}%\n"
+        f"Average SHAP in region: {avg_shap:.4f}\n"
+        f"Fraction with SHAP > 0 (toward human): {positive_fraction:.2f}\n"
+        f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
+        f"Subregion interpretation: {region_classification}\n"
     )
     # Plot region as small heatmap
+    heatmap_fig = plot_linear_heatmap(
+        shap_means,
+        title="Subregion SHAP",
+        start=region_start,
+        end=region_end
+    )
+    heatmap_img = fig_to_image(heatmap_fig)
+    # Plot histogram of SHAP in region
+    hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
+    hist_img = fig_to_image(hist_fig)
+    return (region_info, heatmap_img, hist_img)
 ###############################################################################
     gr.Markdown("""
     # Virus Host Classifier (with Interactive Region Viewer)
     **Step 1**: Predict overall viral sequence origin (human vs non-human)
+    **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
     """)
     with gr.Tab("1) Full-Sequence Analysis"):
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap")
         # Hidden states that store data for step 2
+        # "seq_state" will hold { seq, shap_means }.
+        # "header_state" is optional meta info
         seq_state = gr.State()
         header_state = gr.State()
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
+        **Subregion Analysis**
+        Select start/end positions to view local SHAP signals, distribution, and GC content.
         """)
         with gr.Row():
             region_start = gr.Number(label="Region Start", value=0)
         subregion_info = gr.Textbox(
             label="Subregion Analysis",
+            lines=7,
             interactive=False
         )
+        with gr.Row():
+            subregion_img = gr.Image(label="Subregion SHAP Heatmap")
+            subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         region_btn.click(
             analyze_subregion,
             inputs=[seq_state, header_state, region_start, region_end],
+            outputs=[subregion_info, subregion_img, subregion_hist_img]
         )
     gr.Markdown("""
     1. **Overall Classification** (human vs non-human), using a learned model on k-mer frequencies.
     2. **SHAP Analysis** (ablation-based) to see which k-mer features push classification toward or away from "human".
     3. **Genome-Wide SHAP Heatmap**: Each base's average SHAP across overlapping k-mers.
+    4. **Subregion Exploration**:
+       - Local SHAP signals (heatmap & histogram)
+       - GC content, fraction of bases pushing "human" vs "non-human"
+       - Simple logic-based interpretation based on average SHAP
     """)
 if __name__ == "__main__":