Spaces:

hiyata
/

HostClassifier

Sleeping

App Files Files Community

hiyata committed on Jan 12

Commit

56468ea

verified ·

1 Parent(s): d76e76a

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -160

app.py CHANGED Viewed

@@ -32,7 +32,6 @@ class VirusClassifier(nn.Module):
     def forward(self, x):
         return self.network(x)
 ###############################################################################
 # 2. FASTA PARSING & K-MER FEATURE ENGINEERING
 ###############################################################################
@@ -59,7 +58,7 @@ def parse_fasta(text):
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
-    """Convert a sequence to a k-mer frequency vector."""
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
@@ -75,7 +74,6 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     return vec
 ###############################################################################
 # 3. SHAP-VALUE (ABLATION) CALCULATION
 ###############################################################################
@@ -83,30 +81,29 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
 def calculate_shap_values(model, x_tensor):
     """
     Calculate SHAP values using a simple ablation approach.
-    Returns shap values and model prediction.
     """
     model.eval()
     with torch.no_grad():
-        # Get baseline prediction
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
         baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human' class
-        # Calculate impact of zeroing each feature
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
-            original_value = x_zeroed[0, i].item()
             x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
             prob = probs[0, 1].item()
             impact = baseline_prob - prob
             shap_values.append(impact)
-            x_zeroed[0, i] = original_value  # restore
     return np.array(shap_values), baseline_prob
 ###############################################################################
 # 4. PER-BASE SHAP AGGREGATION
 ###############################################################################
@@ -116,7 +113,6 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
     Returns an array of per-base SHAP contributions by averaging
     the k-mer SHAP values of all k-mers covering that base.
     """
-    # Create the list of k-mers (in lexicographic order)
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
@@ -136,79 +132,44 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
     return shap_means
 ###############################################################################
-# 5. HEATMAP PLOTS
 ###############################################################################
-def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap"):
     """
     Plots a 1D heatmap of per-base SHAP contributions.
     Negative = push toward Non-Human, Positive = push toward Human.
     """
-    heatmap_data = shap_means.reshape(1, -1)  # shape (1, seq_len)
-    fig, ax = plt.subplots(figsize=(12, 2))
     cax = ax.imshow(heatmap_data, aspect='auto', cmap='RdBu_r')
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.2)
     cbar.set_label('SHAP Contribution')
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence')
-    ax.set_title(title)
-    plt.tight_layout()
-    return fig
-def get_top_signal_region(shap_means, window_size=500):
-    """
-    Find the window of length `window_size` that has the highest
-    sum of absolute SHAP values. Returns (start_index, end_index).
-    """
-    seq_len = len(shap_means)
-    if window_size >= seq_len:
-        return 0, seq_len  # entire sequence if window too large
-    abs_values = np.abs(shap_means)
-    max_sum = -1
-    max_start = 0
-    # Slide a window over shap_means
-    current_sum = np.sum(abs_values[:window_size])
-    max_sum = current_sum
-    for start in range(1, seq_len - window_size + 1):
-        # Remove the leftmost base, add the new rightmost base
-        current_sum = current_sum - abs_values[start-1] + abs_values[start + window_size - 1]
-        if current_sum > max_sum:
-            max_sum = current_sum
-            max_start = start
-    return max_start, max_start + window_size
-def plot_zoomed_heatmap(shap_means, window_size=500, title="Zoomed SHAP Region"):
-    """
-    Finds the region with the largest absolute SHAP sum in a fixed window,
-    then plots a 1D heatmap of just that sub-region.
-    """
-    start, end = get_top_signal_region(shap_means, window_size)
-    sub_means = shap_means[start:end].reshape(1, -1)
-    fig, ax = plt.subplots(figsize=(12, 2))
-    cax = ax.imshow(sub_means, aspect='auto', cmap='RdBu_r')
-    cbar = plt.colorbar(cax, orientation='horizontal', pad=0.2)
-    cbar.set_label('SHAP Contribution')
-    ax.set_yticks([])
-    ax.set_xlabel(f'Position in Sequence (zoomed in {start} - {end})')
-    ax.set_title(title)
     plt.tight_layout()
     return fig
-###############################################################################
-# 6. OTHER PLOT: TOP-K K-MER BAR PLOT
-###############################################################################
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
@@ -223,31 +184,24 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
-    plt.xlabel('SHAP value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
     plt.gca().invert_yaxis()
     return fig
-###############################################################################
-# 7. HELPER FUNCTION: FIG TO IMAGE
-###############################################################################
-def fig_to_image(fig):
-    """Convert a Matplotlib figure to a PIL Image."""
-    import io
-    buf = io.BytesIO()
-    fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
-    buf.seek(0)
-    img = Image.open(buf)
-    plt.close(fig)
-    return img
 ###############################################################################
-# 8. MAIN PREDICTION FUNCTION
 ###############################################################################
-def predict(file_obj, top_kmers=10, fasta_text="", zoom_window=500):
-    """Main prediction function for Gradio interface."""
     # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
@@ -256,14 +210,14 @@ def predict(file_obj, top_kmers=10, fasta_text="", zoom_window=500):
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
-            return f"Error reading file: {str(e)}", None, None, None
     else:
-        return "Please provide a FASTA sequence.", None, None, None
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
-        return "No valid FASTA sequences found.", None, None, None
     header, seq = sequences[0]
@@ -274,49 +228,101 @@ def predict(file_obj, top_kmers=10, fasta_text="", zoom_window=500):
         model.load_state_dict(torch.load('model.pt', map_location=device))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
-        return f"Error loading model: {str(e)}", None, None, None
-    # Generate features
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     x_tensor = torch.FloatTensor(scaled_vector).to(device)
-    # Calculate SHAP values and get prediction
     shap_values, prob_human = calculate_shap_values(model, x_tensor)
-    # Prediction text
-    results = [
-        f"Sequence: {header}",
-        f"Prediction: {'Human' if prob_human > 0.5 else 'Non-human'} Origin",
-        f"Confidence: {max(prob_human, 1 - prob_human):.3f}",
-        f"Human Probability: {prob_human:.3f}"
-    ]
-    # Create k-mer list (4-mers in lexicographic order)
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
-    # 1) Top-k k-mer bar plot
-    importance_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
-    importance_img = fig_to_image(importance_fig)
-    # 2) Full-genome per-base SHAP heatmap
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
-    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide Per-base SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
-    # 3) Zoomed region (optional, using the largest absolute SHAP region)
-    if zoom_window > 0:
-        zoom_fig = plot_zoomed_heatmap(shap_means, window_size=zoom_window,
-                                       title=f"Top SHAP Region (window={zoom_window})")
-        zoom_img = fig_to_image(zoom_fig)
-    else:
-        zoom_img = None
-    return "\n".join(results), importance_img, heatmap_img, zoom_img
 ###############################################################################
-# 9. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
@@ -327,57 +333,87 @@ css = """
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
-    # Virus Host Classifier
-    Predicts whether a viral sequence is of human or non-human origin using k-mer analysis.
     """)
-    with gr.Row():
-        with gr.Column(scale=1):
-            file_input = gr.File(
-                label="Upload FASTA file",
-                file_types=[".fasta", ".fa", ".txt"],
-                type="filepath"
-            )
-            text_input = gr.Textbox(
-                label="Or paste FASTA sequence",
-                placeholder=">sequence_name\nACGTACGT...",
-                lines=5
-            )
-            top_k = gr.Slider(
-                minimum=5,
-                maximum=30,
-                value=10,
-                step=1,
-                label="Number of top k-mers to display"
-            )
-            zoom_window = gr.Slider(
-                minimum=0,
-                maximum=5000,
-                value=500,
-                step=100,
-                label="Zoom Window Size (0 to disable zoom plot)"
-            )
-            submit_btn = gr.Button("Analyze Sequence", variant="primary")
-        with gr.Column(scale=2):
-            results_box = gr.Textbox(label="Analysis Results", lines=5)
-            kmer_plot = gr.Image(label="Top k-mer SHAP")
-            full_heatmap = gr.Image(label="Genome-wide SHAP Heatmap")
-            zoomed_heatmap = gr.Image(label="Zoomed SHAP Region (largest signal)")
-    submit_btn.click(
-        predict,
-        inputs=[file_input, top_k, text_input, zoom_window],
-        outputs=[results_box, kmer_plot, full_heatmap, zoomed_heatmap]
-    )
     gr.Markdown("""
-    ### Visualization Guide
-    - **Top k-mer SHAP**: Shows the most influential k-mers and their SHAP values.
-    - **Genome-wide SHAP Heatmap**: Per-base SHAP values across the entire sequence.
-      - Red = push toward human
-      - Blue = push toward non-human
-    - **Zoomed SHAP Region**: Shows the subregion of length 'Zoom Window Size' that has the highest absolute SHAP sum.
     """)
 if __name__ == "__main__":

     def forward(self, x):
         return self.network(x)
 ###############################################################################
 # 2. FASTA PARSING & K-MER FEATURE ENGINEERING
 ###############################################################################
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
+    """Convert a sequence to a k-mer frequency vector for classification."""
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
     return vec
 ###############################################################################
 # 3. SHAP-VALUE (ABLATION) CALCULATION
 ###############################################################################
 def calculate_shap_values(model, x_tensor):
     """
     Calculate SHAP values using a simple ablation approach.
+    Returns shap_values, prob_human
     """
     model.eval()
     with torch.no_grad():
+        # Baseline
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
         baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human' class
+        # Zeroing each feature to measure impact
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
+            original_val = x_zeroed[0, i].item()
             x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
             prob = probs[0, 1].item()
             impact = baseline_prob - prob
             shap_values.append(impact)
+            x_zeroed[0, i] = original_val  # restore
     return np.array(shap_values), baseline_prob
 ###############################################################################
 # 4. PER-BASE SHAP AGGREGATION
 ###############################################################################
     Returns an array of per-base SHAP contributions by averaging
     the k-mer SHAP values of all k-mers covering that base.
     """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     return shap_means
 ###############################################################################
+# 5. PLOTTING / UTILITIES
 ###############################################################################
def fig_to_image(fig):
    """Render a Matplotlib figure to an in-memory PNG and return it as an image.

    The figure is closed whether or not rendering succeeds, so a failed
    ``savefig`` does not leave an open figure behind in pyplot's registry.

    Args:
        fig: the Matplotlib figure to render; it is closed by this call.

    Returns:
        The object returned by ``Image.open`` on the PNG buffer
        (presumably a ``PIL.Image.Image`` -- ``Image`` is imported at file
        level, outside this view).
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
    finally:
        # Always release the figure, even if saving raised.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
    """
    Plots a 1D heatmap of per-base SHAP contributions.
    Negative = push toward Non-Human, Positive = push toward Human.

    The colour scale is symmetric around zero, so the neutral colour of the
    diverging 'RdBu_r' map always means "no contribution" and colour
    reliably encodes sign.

    Args:
        shap_means: 1-D NumPy array of per-base contributions.
        title: plot title; a position suffix is appended for subranges.
        start, end: optional subrange. Applied only when BOTH are given;
            values are truncated to int because UI number fields may deliver
            floats. Callers are expected to pass non-negative, in-range
            bounds.

    Returns:
        The Matplotlib figure (the caller is responsible for closing it).
    """
    if start is not None and end is not None:
        start, end = int(start), int(end)
        shap_means = shap_means[start:end]
        subtitle = f" (positions {start}-{end})"
        first_pos = start
    else:
        subtitle = ""
        first_pos = 0

    heatmap_data = shap_means.reshape(1, -1)  # shape (1, region_length)
    n_bases = heatmap_data.shape[1]

    # Symmetric limits centre the diverging colormap on zero. Fall back to
    # 1.0 for empty, all-zero or all-NaN data.
    limit = float(np.nanmax(np.abs(heatmap_data))) if n_bases else 0.0
    if not np.isfinite(limit) or limit == 0.0:
        limit = 1.0

    fig, ax = plt.subplots(figsize=(12, 2))
    cax = ax.imshow(
        heatmap_data,
        aspect='auto',
        cmap='RdBu_r',
        vmin=-limit,
        vmax=limit,
        # Keep cell centres on integer positions, shifted so the x-axis
        # shows real sequence coordinates rather than 0..n for a subrange.
        extent=(first_pos - 0.5, first_pos + n_bases - 0.5, 0, 1),
    )
    cbar = fig.colorbar(cax, ax=ax, orientation='horizontal', pad=0.2)
    cbar.set_label('SHAP Contribution')
    ax.set_yticks([])
    ax.set_xlabel('Position in Sequence')
    ax.set_title(f"{title}{subtitle}")
    plt.tight_layout()
    return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
+    plt.xlabel('SHAP Value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
     plt.gca().invert_yaxis()
     return fig
def compute_gc_content(sequence):
    """Return the GC percentage (0.0-100.0) of a nucleotide sequence.

    Counting is case-insensitive, so soft-masked (lowercase) bases are
    included. Every character counts toward the length, so ambiguity codes
    such as 'N' lower the percentage.

    Args:
        sequence: nucleotide string; may be empty.

    Returns:
        GC content as a float percentage; 0.0 for an empty sequence.
    """
    if not sequence:
        return 0.0
    normalized = sequence.upper()
    gc_count = normalized.count('G') + normalized.count('C')
    return (gc_count / len(sequence)) * 100.0
 ###############################################################################
+# 6. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
+def analyze_sequence(file_obj, top_kmers=10, fasta_text=""):
+    """Analyzes the entire genome, returning classification and a heatmap."""
     # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
+            return (f"Error reading file: {str(e)}", None, None, None, None)
     else:
+        return ("Please provide a FASTA sequence.", None, None, None, None)
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
+        return ("No valid FASTA sequences found.", None, None, None, None)
     header, seq = sequences[0]
         model.load_state_dict(torch.load('model.pt', map_location=device))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
+        return (f"Error loading model: {str(e)}", None, None, None, None)
+    # Vectorize + scale
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     x_tensor = torch.FloatTensor(scaled_vector).to(device)
+    # SHAP + classification
     shap_values, prob_human = calculate_shap_values(model, x_tensor)
+    prob_nonhuman = 1.0 - prob_human
+    classification = "Human" if prob_human > 0.5 else "Non-human"
+    confidence = max(prob_human, prob_nonhuman)
+    # Build results text
+    results_text = (
+        f"Sequence: {header}\n"
+        f"Length: {len(seq):,} bases\n"
+        f"Classification: {classification}\n"
+        f"Confidence: {confidence:.3f}\n"
+        f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})"
+    )
+    # K-mer importance plot
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
+    bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
+    bar_img = fig_to_image(bar_fig)
+    # Per-base SHAP for entire genome
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
+    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
+    # Return:
+    # 1) results text
+    # 2) k-mer bar image
+    # 3) full-genome heatmap
+    # 4) the "state" we need for step 2: (sequence, shap_means)
+    #    We'll store these in a dictionary so we can pass it around in Gradio.
+    state_dict = {
+        "seq": seq,
+        "shap_means": shap_means
+    }
+    return (results_text, bar_img, heatmap_img, state_dict, header)
+###############################################################################
+# 7. SUBREGION ANALYSIS (Gradio Step 2)
+###############################################################################
def analyze_subregion(state, header, region_start, region_end):
    """
    Takes stored data from step 1 and a user-chosen region.
    Returns a subregion heatmap and some stats (like GC content, average SHAP).

    Args:
        state: dict with keys "seq" (sequence string) and "shap_means"
            (per-base contribution array), as produced by step 1.
        header: sequence name, used only in the report text.
        region_start, region_end: requested half-open range [start, end).
            UI number fields deliver floats (or None when cleared), so both
            are coerced to int before slicing; fractional values are
            truncated.

    Returns:
        (region_info, heatmap_img). On any validation failure the first item
        is an error message and the second is None.
    """
    if not state or "seq" not in state or "shap_means" not in state:
        return ("No sequence data found. Please run Step 1 first.", None)
    seq = state["seq"]
    shap_means = state["shap_means"]

    # Slice indices must be integers; reject None / NaN / inf instead of
    # letting the slice raise a TypeError.
    try:
        region_start = int(region_start)
        region_end = int(region_end)
    except (TypeError, ValueError, OverflowError):
        return ("Region Start and Region End must be numbers.", None)

    # Validate bounds
    region_start = max(0, min(region_start, len(seq)))
    region_end = max(0, min(region_end, len(seq)))
    if region_end <= region_start:
        return ("Invalid region range. End must be > Start.", None)

    # Subsequence
    region_seq = seq[region_start:region_end]
    region_shap = shap_means[region_start:region_end]

    # Some stats
    gc_percent = compute_gc_content(region_seq)
    avg_shap = float(np.mean(region_shap))
    if avg_shap > 0:
        direction = 'toward human'
    elif avg_shap < 0:
        direction = 'toward non-human'
    else:
        direction = 'neutral'
    region_info = (
        f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
        f"Region length: {len(region_seq)} bases\n"
        f"GC content: {gc_percent:.2f}%\n"
        f"Average SHAP in region: {avg_shap:.4f} "
        f"({direction})"
    )

    # Plot region as small heatmap
    fig = plot_linear_heatmap(shap_means,
                              title="Subregion SHAP",
                              start=region_start,
                              end=region_end)
    heatmap_img = fig_to_image(fig)
    return (region_info, heatmap_img)
 ###############################################################################
+# 8. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
+    # Virus Host Classifier (with Interactive Region Viewer)
+    **Step 1**: Predict overall viral sequence origin (human vs non-human)
+    **Step 2**: Explore subregions to see local SHAP signals and GC content
     """)
+    with gr.Tab("1) Full-Sequence Analysis"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                file_input = gr.File(
+                    label="Upload FASTA file",
+                    file_types=[".fasta", ".fa", ".txt"],
+                    type="filepath"
+                )
+                text_input = gr.Textbox(
+                    label="Or paste FASTA sequence",
+                    placeholder=">sequence_name\nACGTACGT...",
+                    lines=5
+                )
+                top_k = gr.Slider(
+                    minimum=5,
+                    maximum=30,
+                    value=10,
+                    step=1,
+                    label="Number of top k-mers to display"
+                )
+                analyze_btn = gr.Button("Analyze Sequence", variant="primary")
+            with gr.Column(scale=2):
+                results_box = gr.Textbox(
+                    label="Classification Results", lines=7, interactive=False
+                )
+                kmer_img = gr.Image(label="Top k-mer SHAP")
+                genome_img = gr.Image(label="Genome-wide SHAP Heatmap")
+        # Hidden states that store data for step 2
+        # "state" will hold (sequence, shap_means).
+        # "header" is optional meta info
+        seq_state = gr.State()
+        header_state = gr.State()
+        # The "analyze_sequence" function returns 5 values, which we map here:
+        analyze_btn.click(
+            analyze_sequence,
+            inputs=[file_input, top_k, text_input],
+            outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
+        )
+    with gr.Tab("2) Subregion Exploration"):
+        gr.Markdown("""
+        Select start/end positions to view local SHAP signals.
+        """)
+        with gr.Row():
+            region_start = gr.Number(label="Region Start", value=0)
+            region_end = gr.Number(label="Region End", value=500)
+            region_btn = gr.Button("Analyze Subregion")
+        subregion_info = gr.Textbox(
+            label="Subregion Analysis",
+            lines=4,
+            interactive=False
+        )
+        subregion_img = gr.Image(label="Subregion SHAP Heatmap")
+        region_btn.click(
+            analyze_subregion,
+            inputs=[seq_state, header_state, region_start, region_end],
+            outputs=[subregion_info, subregion_img]
+        )
     gr.Markdown("""
+    ### What does this interface provide?
+    1. **Overall Classification** (human vs non-human), using a learned model on k-mer frequencies.
+    2. **SHAP Analysis** (ablation-based) to see which k-mer features push classification toward or away from "human".
+    3. **Genome-Wide SHAP Heatmap**: Each base's average SHAP across overlapping k-mers.
+    4. **Subregion Exploration**:
+       - View SHAP signals in a user-chosen region.
+       - Calculate local GC content, average SHAP, etc.
+    ### Tips
+    - For very large sequences (e.g., >100k bases), the full heatmap might be large; consider downsampling if needed.
+    - Adjust *Region Start* and *End* to explore different parts of the genome.
     """)
 if __name__ == "__main__":