Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Jan 12

Commit

03f2bb5

verified ·

1 Parent(s): 9a00943

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -40

app.py CHANGED Viewed

@@ -140,28 +140,29 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     """
     Finds the subregion of length `window_size` that has the maximum
     (mode="max") or minimum (mode="min") average SHAP.
-    Returns (best_start, best_end, avg_shap).
     """
     n = len(shap_means)
     if window_size >= n:
-        # If the window is bigger than the entire sequence, return the whole seq
-        avg_val = np.mean(shap_means) if n > 0 else 0.0
         return (0, n, avg_val)
-    # For efficiency, we can do a rolling sum approach
-    csum = np.cumsum(shap_means)
-    # csum[i] = sum of shap_means[0..i-1]
-    def window_sum(start):
-        end = start + window_size
-        return csum[end] - csum[start]
     best_start = 0
-    # Initialize the best with the first window
-    best_sum = window_sum(0)
     best_avg = best_sum / window_size
     for start in range(1, n - window_size + 1):
-        wsum = window_sum(start)
         wavg = wsum / window_size
         if mode == "max":
             if wavg > best_avg:
@@ -172,7 +173,7 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
                 best_avg = wavg
                 best_start = start
-    return (best_start, best_start + window_size, best_avg)
 ###############################################################################
 # 6. PLOTTING / UTILITIES
@@ -192,10 +193,9 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     Plots a 1D heatmap of per-base SHAP contributions.
     Negative = push toward Non-Human, Positive = push toward Human.
     Optionally can show only a subrange (start:end).
-    We adjust layout so the colorbar is well below the x-axis:
       - orientation='horizontal', pad=0.35
-      - plt.subplots_adjust(bottom=0.4)
     """
     if start is not None and end is not None:
         shap_means = shap_means[start:end]
@@ -294,11 +294,15 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     # Load model and scaler
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         model = VirusClassifier(256).to(device)
-        model.load_state_dict(torch.load('model.pt', map_location=device))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
-        return (f"Error loading model: {str(e)}", None, None, None, None)
     # Vectorize + scale
     freq_vector = sequence_to_kmer_vector(seq)
@@ -344,13 +348,13 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     heatmap_img = fig_to_image(heatmap_fig)
     # Store data for subregion analysis
-    state_dict = {
         "seq": seq,
         "shap_means": shap_means
     }
-    # We now return 5 items (not 6):
-    return (results_text, bar_img, heatmap_img, state_dict, header)
 ###############################################################################
 # 8. SUBREGION ANALYSIS (Gradio Step 2)
@@ -475,16 +479,10 @@ with gr.Blocks(css=css) as iface:
                 kmer_img = gr.Image(label="Top k-mer SHAP")
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap")
-        # State for step 2
         seq_state = gr.State()
         header_state = gr.State()
-        # analyze_sequence(...) now returns 5 items, so we have 5 outputs.
-        # 1) results_text
-        # 2) bar_img
-        # 3) heatmap_img
-        # 4) state_dict
-        # 5) header
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
@@ -517,17 +515,12 @@ with gr.Blocks(css=css) as iface:
         )
     gr.Markdown("""
-    ### What does this interface provide?
-    1. **Overall Classification** (human vs non-human), using a learned model on k-mer frequencies.
-    2. **SHAP Analysis** (ablation-based) to see which k-mer features push classification toward or away from "human".
-    3. **Genome-Wide SHAP Heatmap**: Each base's average SHAP across overlapping k-mers.
-    4. **Subregion Exploration**:
-       - Local SHAP signals (heatmap & histogram)
-       - GC content, fraction of bases pushing "human" vs "non-human"
-       - Simple logic-based interpretation based on average SHAP
-    5. **Identification of the most 'human-pushing' subregion** (max average SHAP)
-       and the most 'non-human–pushing' subregion (min average SHAP),
-       each of a chosen window size.
     """)
 if __name__ == "__main__":

     """
     Finds the subregion of length `window_size` that has the maximum
     (mode="max") or minimum (mode="min") average SHAP.
+    Returns (best_start, best_end, best_avg).
     """
     n = len(shap_means)
+    if n == 0:
+        # Edge case: empty array
+        return (0, 0, 0.0)
     if window_size >= n:
+        # If the window is at least as long as the entire sequence, return the entire sequence
+        avg_val = float(np.mean(shap_means))
         return (0, n, avg_val)
+    # We'll build csum as length n+1 so csum[i] = sum of shap_means[:i]
+    # That means sum in [start, start+window_size) = csum[start+window_size] - csum[start].
+    csum = np.zeros(n + 1, dtype=np.float32)
+    csum[1:] = np.cumsum(shap_means)
     best_start = 0
+    # Initialize with the first window: [0, window_size)
+    best_sum = csum[window_size] - csum[0]
     best_avg = best_sum / window_size
     for start in range(1, n - window_size + 1):
+        wsum = csum[start + window_size] - csum[start]
         wavg = wsum / window_size
         if mode == "max":
             if wavg > best_avg:
                 best_avg = wavg
                 best_start = start
+    return (best_start, best_start + window_size, float(best_avg))
 ###############################################################################
 # 6. PLOTTING / UTILITIES
     Plots a 1D heatmap of per-base SHAP contributions.
     Negative = push toward Non-Human, Positive = push toward Human.
     Optionally can show only a subrange (start:end).
+    Adjust layout so the colorbar is well below the x-axis:
       - orientation='horizontal', pad=0.35
+      - plt.subplots_adjust(bottom=0.4)
     """
     if start is not None and end is not None:
         shap_means = shap_means[start:end]
     # Load model and scaler
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
+        # Use weights_only=True to address the FutureWarning about untrusted pickle data
+        state_dict = torch.load('model.pt', map_location=device, weights_only=True)
         model = VirusClassifier(256).to(device)
+        model.load_state_dict(state_dict)
+        # Load scaler (loading may emit a warning if it was saved with a different library version)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
+        return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
     # Vectorize + scale
     freq_vector = sequence_to_kmer_vector(seq)
     heatmap_img = fig_to_image(heatmap_fig)
     # Store data for subregion analysis
+    state_dict_out = {
         "seq": seq,
         "shap_means": shap_means
     }
+    # Return exactly 5 items
+    return (results_text, bar_img, heatmap_img, state_dict_out, header)
 ###############################################################################
 # 8. SUBREGION ANALYSIS (Gradio Step 2)
                 kmer_img = gr.Image(label="Top k-mer SHAP")
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap")
         seq_state = gr.State()
         header_state = gr.State()
+        # analyze_sequence(...) returns 5 items.
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
         )
     gr.Markdown("""
+    ### Interface Features
+    - **Overall Classification** (human vs non-human) using k-mer frequencies.
+    - **Top k-mer SHAP**: which k-mers push the classifier output.
+    - **Genome-Wide SHAP Heatmap**: each base's average SHAP across overlapping k-mers.
+    - **Identify Subregions** (sliding window) with the strongest push for human or non-human.
+    - **Subregion Exploration**: local SHAP heatmap & histogram, GC content, fraction of positions pushing human vs. non-human.
     """)
 if __name__ == "__main__":