Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12

Commit

ef80028

verified ·

1 Parent(s): f1d4be6

Update app.py

Browse files

Files changed (1) hide show

app.py +165 -310

app.py CHANGED Viewed

@@ -8,10 +8,6 @@ import matplotlib.pyplot as plt
 import io
 from PIL import Image
-##############################################################################
-# MODEL DEFINITION
-##############################################################################
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
         super(VirusClassifier, self).__init__()
@@ -32,10 +28,6 @@ class VirusClassifier(nn.Module):
     def forward(self, x):
         return self.network(x)
-##############################################################################
-# UTILITIES
-##############################################################################
 def parse_fasta(text):
     """
     Parses FASTA formatted text into a list of (header, sequence).
@@ -61,7 +53,7 @@ def parse_fasta(text):
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """
-    Convert a sequence to a k-mer frequency vector of size len(ACGT^k).
     """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
@@ -78,355 +70,218 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     return vec
-def ablation_importance(model, x_tensor):
     """
-    Calculates a simple ablation-based importance measure for each feature:
-    1. Compute baseline human probability p_base.
-    2. For each feature i, set x[i] = 0, re-run inference, compute new p, and
-       measure delta = p_base - p.
-    3. Return array of deltas (positive means that removing that feature
-       *decreases* the probability => that feature was pushing it higher).
     """
     model.eval()
     with torch.no_grad():
-        # Baseline probability
-        output = model(x_tensor)
-        probs = torch.softmax(output, dim=1)
-        p_base = probs[0, 1].item()
-    # Store the delta importances
-    importances = np.zeros(x_tensor.shape[1], dtype=np.float32)
-    # For efficiency, we do ablation one feature at a time
-    for i in range(x_tensor.shape[1]):
-        x_copy = x_tensor.clone()
-        x_copy[0, i] = 0.0  # Ablate this feature
-        with torch.no_grad():
-            output_ablation = model(x_copy)
-            probs_ablation = torch.softmax(output_ablation, dim=1)
-            p_ablation = probs_ablation[0, 1].item()
-        # Delta
-        importances[i] = p_base - p_ablation
-    return importances, p_base
-##############################################################################
-# PLOTTING
-##############################################################################
-def create_step_and_frequency_plot(important_kmers, human_prob, title):
     """
-    Creates a combined step plot (showing how each k-mer modifies the probability)
-    and a frequency vs. sigma bar chart.
     """
-    fig = plt.figure(figsize=(15, 10))
-    # Create grid for subplots
-    gs = plt.GridSpec(2, 1, height_ratios=[1.5, 1], hspace=0.3)
-    # 1. Probability Step Plot
-    ax1 = plt.subplot(gs[0])
-    current_prob = 0.5
-    steps = [('Start', current_prob, 0)]
-    for kmer_info in important_kmers:
-        change = kmer_info['impact']  # positive => pushes up, negative => pushes down
-        current_prob += change
-        steps.append((kmer_info['kmer'], current_prob, change))
-    x = range(len(steps))
-    y = [step[1] for step in steps]
-    # Plot steps
-    ax1.step(x, y, 'b-', where='post', label='Probability', linewidth=2)
-    ax1.plot(x, y, 'b.', markersize=10)
-    # Add reference line
-    ax1.axhline(y=0.5, color='r', linestyle='--', label='Neutral (0.5)')
-    # Customize plot
-    ax1.grid(True, linestyle='--', alpha=0.7)
-    ax1.set_ylim(0, 1)
-    ax1.set_ylabel('Human Probability')
-    ax1.set_title(f'K-mer Contributions to Prediction (final prob: {human_prob:.3f})')
-    # Add labels for each point
-    for i, (kmer, prob, change) in enumerate(steps):
-        # Add k-mer label
-        ax1.annotate(kmer,
-                     (i, prob),
-                     xytext=(0, 10 if i % 2 == 0 else -20),
-                     textcoords='offset points',
-                     ha='center',
-                     rotation=45)
-        # Add change value
-        if i > 0:
-            change_text = f'{change:+.3f}'
-            color = 'green' if change > 0 else 'red'
-            ax1.annotate(change_text,
-                         (i, prob),
-                         xytext=(0, -20 if i % 2 == 0 else 10),
-                         textcoords='offset points',
-                         ha='center',
-                         color=color)
-    ax1.legend()
-    # 2. K-mer Frequency and Sigma Plot
-    ax2 = plt.subplot(gs[1])
-    # Prepare data
-    kmers = [k['kmer'] for k in important_kmers]
-    frequencies = [k['occurrence'] for k in important_kmers]
-    sigmas = [k['sigma'] for k in important_kmers]
-    # Color the bars: if impact>0 => green, else red
-    colors = ['g' if k['impact'] > 0 else 'r' for k in important_kmers]
-    # Create bar plot for frequencies
-    x = np.arange(len(kmers))
-    width = 0.35
-    ax2.bar(x - width/2, frequencies, width, label='Frequency (%)', color=colors, alpha=0.6)
-    # Twin axis for sigma
-    ax2_twin = ax2.twinx()
-    # To highlight positive or negative sigma, pick color accordingly
-    sigma_colors = []
-    for s, c in zip(sigmas, colors):
-        if s >= 0:
-            sigma_colors.append('blue')  # above average
-        else:
-            sigma_colors.append('gray')  # below average
-    ax2_twin.bar(x + width/2, sigmas, width, label='σ from Mean', color=sigma_colors, alpha=0.3)
-    # Customize plot
-    ax2.set_xticks(x)
-    ax2.set_xticklabels(kmers, rotation=45)
-    ax2.set_ylabel('Frequency (%)')
-    ax2_twin.set_ylabel('Standard Deviations (σ) from Mean')
-    ax2.set_title('K-mer Frequencies and Statistical Significance')
-    # Add legends
-    lines1, labels1 = ax2.get_legend_handles_labels()
-    lines2, labels2 = ax2_twin.get_legend_handles_labels()
-    ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')
-    plt.tight_layout()
     return fig
-def create_shap_like_bar_plot(impact_values, kmer_list, top_k):
     """
-    Creates a horizontal bar plot showing the top_k features by absolute impact.
-    impact_values: array of float (length=256).
-    kmer_list: list of all k=4 kmers in order.
-    top_k: integer, how many top features to display.
     """
-    # Sort by absolute impact
-    indices_sorted = np.argsort(np.abs(impact_values))[::-1]
-    top_indices = indices_sorted[:top_k]
-    top_impacts = impact_values[top_indices]
-    top_kmers = [kmer_list[i] for i in top_indices]
-    fig = plt.figure(figsize=(8, 6))
-    plt.barh(range(len(top_impacts)), top_impacts, color=['green' if i > 0 else 'red' for i in top_impacts])
-    plt.yticks(range(len(top_impacts)), top_kmers)
-    plt.xlabel("Impact on Human Probability (Ablation)")
-    plt.title(f"Top {top_k} K-mers by Absolute Impact")
-    plt.gca().invert_yaxis()  # Highest at top
-    plt.tight_layout()
-    return fig
-def create_global_bar_plot(impact_values, kmer_list):
-    """
-    Creates a bar plot for ALL features (256) to see the global distribution.
-    """
-    fig = plt.figure(figsize=(12, 6))
-    indices_sorted = np.argsort(np.abs(impact_values))[::-1]
-    sorted_impacts = impact_values[indices_sorted]
-    sorted_kmers = [kmer_list[i] for i in indices_sorted]
-    plt.bar(range(len(sorted_impacts)), sorted_impacts,
-            color=['green' if i > 0 else 'red' for i in sorted_impacts])
-    plt.title("Global Impact of All 256 K-mers (Ablation Method)")
-    plt.xlabel("K-mer (sorted by |impact|)")
-    plt.ylabel("Impact on Human Probability")
-    # Optionally, we can skip labeling all 256 on x-axis.
-    # But we can show only the top/bottom or none for clarity.
-    plt.tight_layout()
     return fig
-##############################################################################
-# MAIN PREDICTION FUNCTION
-##############################################################################
-def predict(file_obj, top_kmers=10, advanced_plots=False, fasta_text=""):
     """
-    Main prediction function called by Gradio.
-    - file_obj: optional uploaded FASTA file
-    - top_kmers: number of top k-mers to display in the main SHAP-like plot
-    - advanced_plots: bool, whether to return global bar plots
-    - fasta_text: optional direct-pasted FASTA text
     """
-    # Priority: If user pasted text, use that; otherwise use uploaded file.
     if fasta_text.strip():
         text = fasta_text.strip()
-    else:
-        if file_obj is None:
-            return "No FASTA input provided", None, None, None
         try:
-            if isinstance(file_obj, str):
-                text = file_obj
-            else:
-                text = file_obj.decode('utf-8')
         except Exception as e:
-            return f"Error reading file: {str(e)}", None, None, None
     # Parse FASTA
     sequences = parse_fasta(text)
-    if len(sequences) == 0:
-        return "No valid FASTA sequences found", None, None, None
     header, seq = sequences[0]
-    # Load model + scaler
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    model = VirusClassifier(256).to(device)
     try:
-        state_dict = torch.load('model.pt', map_location=device)
-        model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
-        return f"Error loading model or scaler: {str(e)}", None, None, None
-    # Prepare the vector
-    raw_freq_vector = sequence_to_kmer_vector(seq, k=4)
-    scaled_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
-    X_tensor = torch.FloatTensor(scaled_vector).to(device)
-    # Compute ablation-based importances
-    importances, p_base = ablation_importance(model, X_tensor)
-    # p_base is baseline human probability
-    # We also want frequency in % and sigma from mean
-    # If your scaler is e.g. StandardScaler, then "scaled_vector[0][i]" is
-    # how many std devs from the mean that feature is.
-    # We'll gather info in a list of dicts for each k-mer.
-    kmers_4 = [''.join(p) for p in product("ACGT", repeat=4)]
-    kmer_dict = {km: i for i, km in enumerate(kmers_4)}
-    # We'll sort by absolute impact to get the top 10 by default.
-    abs_sorted_idx = np.argsort(np.abs(importances))[::-1]
-    # But for the final step/frequency plot we only show top_kmers
-    top_indices = abs_sorted_idx[:top_kmers]
-    # Build a list of the top k-mers
     important_kmers = []
-    for idx in top_indices:
-        # "impact" is how much that feature changed the probability
-        impact = importances[idx]
-        # raw frequency => raw_freq_vector[idx] * 100 for %
-        freq_pct = float(raw_freq_vector[idx] * 100.0)
-        # sigma => scaled_vector[0][idx]
-        sigma_val = float(scaled_vector[0][idx])
         important_kmers.append({
-            'kmer': kmers_4[idx],
-            'impact': impact,
-            'occurrence': freq_pct,
-            'sigma': sigma_val
         })
-    # For text output
-    # We decide final class based on model's direct output
-    with torch.no_grad():
-        output = model(X_tensor)
-        probs = torch.softmax(output, dim=1)
-    pred_class = 1 if probs[0,1] > probs[0,0] else 0
-    pred_label = 'human' if pred_class == 1 else 'non-human'
-    human_prob = probs[0,1].item()
-    nonhuman_prob = probs[0,0].item()
-    confidence = max(human_prob, nonhuman_prob)
-    results_text = (f"Sequence: {header}\n"
-                    f"Prediction: {pred_label}\n"
-                    f"Confidence: {confidence:.4f}\n"
-                    f"Human probability: {human_prob:.4f}\n"
-                    f"Non-human probability: {nonhuman_prob:.4f}\n"
-                    f"Most influential k-mers (by ablation impact):\n")
-    for kmer_info in important_kmers:
-        # sign => if impact>0 => removing it lowers p(human), so it was pushing p(human) up
-        direction = "UP (toward human)" if kmer_info['impact'] > 0 else "DOWN (toward non-human)"
-        results_text += (
-            f"  {kmer_info['kmer']}: {direction}, "
-            f"Impact={kmer_info['impact']:.4f}, "
-            f"Occ={kmer_info['occurrence']:.2f}% of seq, "
-            f"{abs(kmer_info['sigma']):.2f}σ "
-            + ("above" if kmer_info['sigma']>0 else "below")
-            + " mean\n"
         )
-    # PLOT 1: A SHAP-like bar plot for the top K features
-    shap_fig = create_shap_like_bar_plot(importances, kmers_4, top_kmers)
-    # PLOT 2: Step + frequency plot for the top K features
-    step_fig = create_step_and_frequency_plot(important_kmers, human_prob, header)
-    # PLOT 3 (optional advanced): global bar plot of all 256 features
-    global_fig = None
-    if advanced_plots:
-        global_fig = create_global_bar_plot(importances, kmers_4)
-    # Convert figures to PIL Images
     def fig_to_image(fig):
         buf = io.BytesIO()
-        fig.savefig(buf, format='png', bbox_inches='tight', dpi=200)
         buf.seek(0)
-        im = Image.open(buf)
         plt.close(fig)
-        return im
-    shap_img = fig_to_image(shap_fig)
-    step_img = fig_to_image(step_fig)
-    if global_fig is not None:
-        global_img = fig_to_image(global_fig)
-    else:
-        global_img = None
-    return results_text, shap_img, step_img, global_img
-##############################################################################
-# GRADIO INTERFACE
-##############################################################################
-title_text = "Virus Host Classifier"
-description_text = """
-Upload or paste a FASTA sequence to predict if it's likely **human** or **non-human** origin.
-- **k=4** k-mers are used as features.
-- We display ablation-based feature importance for interpretability.
-- Advanced plots can be toggled to see the global distribution of all 256 k-mer impacts.
 """
-iface = gr.Interface(
-    fn=predict,
-    inputs=[
-        gr.File(label="Upload FASTA file", type="binary", optional=True),
-        gr.Slider(label="Number of top k-mers to show", minimum=1, maximum=50, value=10, step=1),
-        gr.Checkbox(label="Show advanced (global) plots?", value=False),
-        gr.Textbox(label="Or paste FASTA text here", lines=5, placeholder=">header\nACGTACGT...")
-    ],
-    outputs=[
-        gr.Textbox(label="Results", lines=10),
-        gr.Image(label="SHAP-like Top-k K-mer Bar Plot"),
-        gr.Image(label="Step & Frequency Plot (Top-k)"),
-        gr.Image(label="Global 256-K-mer Plot (advanced)", optional=True)
-    ],
-    title=title_text,
-    description=description_text
-)
 if __name__ == "__main__":
-    iface.launch(share=True)

 import io
 from PIL import Image
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
         super(VirusClassifier, self).__init__()
     def forward(self, x):
         return self.network(x)
 def parse_fasta(text):
     """
     Parses FASTA formatted text into a list of (header, sequence).
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """
+    Convert a sequence to a k-mer frequency vector.
     """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     return vec
def calculate_shap_values(model, x_tensor):
    """
    Estimate per-feature importance with a simple ablation approach.

    Despite the name, these are not true Shapley values: each feature of the
    first row of ``x_tensor`` is set to 0 in turn and the change in the
    class-1 softmax probability is recorded.

    Args:
        model: Torch module mapping a (batch, n_features) tensor to
            (batch, n_classes) logits with at least two classes. It is put
            into eval mode as a side effect.
        x_tensor: 2-D tensor; only row 0 is analysed. It is not modified.

    Returns:
        Tuple ``(values, baseline_prob)``. ``values`` is a float64 NumPy array
        with one entry per feature holding ``baseline_prob - ablated_prob``
        (positive: zeroing the feature lowered the class-1 probability).
        ``baseline_prob`` is the un-ablated class-1 probability as a float.
    """
    model.eval()
    n_features = x_tensor.shape[1]
    with torch.no_grad():
        baseline_prob = torch.softmax(model(x_tensor[:1]), dim=1)[0, 1].item()

        # Build every ablated variant at once: row i is a copy of the input
        # with feature i zeroed. One batched forward pass replaces
        # n_features separate passes (and as many host/device syncs).
        # NOTE(review): assumes the model treats rows independently in eval
        # mode (true for standard dropout/batch-norm layers) — confirm if
        # custom layers are added.
        ablated = x_tensor[:1].repeat(n_features, 1)
        diag = torch.arange(n_features, device=x_tensor.device)
        ablated[diag, diag] = 0
        ablated_probs = torch.softmax(model(ablated), dim=1)[:, 1]

    values = baseline_prob - ablated_probs.double().cpu().numpy()
    return values, baseline_prob
def create_importance_plot(shap_values, kmers, top_k=10):
    """
    Create a horizontal bar plot of the most influential features.

    Args:
        shap_values: 1-D array-like of per-feature impact values.
        kmers: Sequence of feature labels, indexed like ``shap_values``.
        top_k: How many features (largest absolute impact) to show.

    Returns:
        The Matplotlib figure. Bars are ordered with the most influential
        feature at the top; positive impacts are green, others red.
    """
    # The 'seaborn' style was renamed 'seaborn-v0_8' in newer Matplotlib and
    # the old name no longer resolves there, so pick whichever is installed.
    style_name = next(
        (s for s in ('seaborn-v0_8', 'seaborn') if s in plt.style.available),
        'default',
    )
    plt.style.use(style_name)
    fig = plt.figure(figsize=(10, 8))

    shap_values = np.asarray(shap_values)
    # UI sliders may deliver floats; a negative-zero slice ([-0:]) would also
    # select every feature, so slice a descending ordering from the front.
    top_k = max(int(top_k), 0)

    # Sort by absolute importance, largest first. Combined with the inverted
    # y-axis below this places the most influential k-mer at the top.
    indices = np.argsort(np.abs(shap_values))[::-1][:top_k]
    values = shap_values[indices]
    features = [kmers[i] for i in indices]
    colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]

    positions = range(len(values))
    plt.barh(positions, values, color=colors)
    plt.yticks(positions, features)
    plt.xlabel('Impact on Prediction (SHAP value)')
    plt.title(f'Top {top_k} Most Influential k-mers')
    plt.gca().invert_yaxis()
    return fig
def create_contribution_plot(important_kmers, final_prob):
    """
    Create a line plot of cumulative feature contributions.

    Starting from a neutral 0.5, each entry's ``'impact'`` is added in the
    order given and the running total is plotted against the entry's
    ``'kmer'`` label.

    Args:
        important_kmers: Iterable of dicts with at least ``'kmer'`` and
            ``'impact'`` keys.
        final_prob: Model probability for the sequence. Currently not used
            by the plot; kept so existing callers keep working.

    Returns:
        The Matplotlib figure. The y-axis is fixed to [0, 1], so running
        totals outside that range are not visible.
    """
    # The 'seaborn' style was renamed 'seaborn-v0_8' in newer Matplotlib and
    # the old name no longer resolves there, so pick whichever is installed.
    style_name = next(
        (s for s in ('seaborn-v0_8', 'seaborn') if s in plt.style.available),
        'default',
    )
    plt.style.use(style_name)
    fig = plt.figure(figsize=(12, 6))

    base_prob = 0.5
    cumulative = [base_prob]
    labels = ['Base']
    for kmer_info in important_kmers:
        cumulative.append(cumulative[-1] + kmer_info['impact'])
        labels.append(kmer_info['kmer'])

    positions = range(len(cumulative))
    plt.plot(positions, cumulative, 'b-o', linewidth=2)
    plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
    plt.xticks(positions, labels, rotation=45)
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    plt.title('Cumulative Feature Contributions')
    plt.ylabel('Probability of Human Origin')
    return fig
def _read_fasta_upload(file_obj):
    """Return the text of an uploaded file given as bytes, a path, or an object with ``.name``."""
    if isinstance(file_obj, (bytes, bytearray)):
        return bytes(file_obj).decode('utf-8')
    # Depending on the gr.File ``type`` setting, Gradio hands over a path
    # string or a temp-file wrapper whose ``.name`` is the path on disk.
    path = file_obj if isinstance(file_obj, str) else getattr(file_obj, 'name', None)
    if path is None:
        raise TypeError(f"Unsupported upload type: {type(file_obj).__name__}")
    with open(path, encoding='utf-8') as handle:
        return handle.read()


def predict(file_obj, top_kmers=10, fasta_text=""):
    """
    Main prediction function for the Gradio interface.

    Args:
        file_obj: Optional uploaded FASTA file (bytes, a file path, or an
            object exposing the path as ``.name``).
        top_kmers: Number of most influential k-mers to report and plot.
        fasta_text: Optional pasted FASTA text; takes priority over
            ``file_obj`` when non-blank.

    Returns:
        Tuple ``(results_text, importance_image, contribution_image)``.
        On any input or loading problem the first element is an error
        message and both images are ``None``.
    """
    # Handle input: pasted text wins over an uploaded file
    pasted = (fasta_text or "").strip()
    if pasted:
        text = pasted
    elif file_obj is not None:
        try:
            text = _read_fasta_upload(file_obj)
        except Exception as e:  # UI boundary: report instead of crashing
            return f"Error reading file: {str(e)}", None, None
    else:
        return "Please provide a FASTA sequence either by file upload or text input.", None, None

    # Parse FASTA; only the first record is classified
    sequences = parse_fasta(text)
    if not sequences:
        return "No valid FASTA sequences found in input.", None, None
    header, seq = sequences[0]

    # Load model and scaler
    # NOTE(review): torch.load and joblib.load both unpickle their input;
    # model.pt / scaler.pkl must be trusted artifacts.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    try:
        model = VirusClassifier(256).to(device)
        model.load_state_dict(torch.load('model.pt', map_location=device))
        scaler = joblib.load('scaler.pkl')
    except Exception as e:  # UI boundary: report instead of crashing
        return f"Error loading model: {str(e)}", None, None

    # Generate features
    freq_vector = sequence_to_kmer_vector(seq)
    scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
    x_tensor = torch.FloatTensor(scaled_vector).to(device)

    # Calculate ablation-based importances and the human-class probability
    shap_values, human_prob = calculate_shap_values(model, x_tensor)

    # Generate k-mer information, most influential first
    kmers = [''.join(p) for p in product("ACGT", repeat=4)]
    top_kmers = max(int(top_kmers), 0)  # sliders may deliver floats
    important_indices = np.argsort(np.abs(shap_values))[::-1][:top_kmers]
    important_kmers = [
        {
            'kmer': kmers[idx],
            'impact': shap_values[idx],
            'frequency': freq_vector[idx] * 100,
            'significance': scaled_vector[0][idx],
        }
        for idx in important_indices
    ]

    # Format results text
    results = [
        f"Sequence: {header}",
        f"Prediction: {'Human' if human_prob > 0.5 else 'Non-human'} Origin",
        f"Confidence: {max(human_prob, 1-human_prob):.3f}",
        f"Human Probability: {human_prob:.3f}",
        "\nTop Contributing k-mers:",
    ]
    for kmer in important_kmers:
        direction = "→ Human" if kmer['impact'] > 0 else "→ Non-human"
        results.append(
            f"• {kmer['kmer']}: {direction} "
            f"(impact: {kmer['impact']:.3f}, "
            f"freq: {kmer['frequency']:.2f}%)"
        )

    # Generate plots
    shap_plot = create_importance_plot(shap_values, kmers, top_kmers)
    contribution_plot = create_contribution_plot(important_kmers, human_prob)

    # Convert plots to images
    def fig_to_image(fig):
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
        buf.seek(0)
        img = Image.open(buf)
        plt.close(fig)  # release the figure so repeated calls do not accumulate
        return img

    return "\n".join(results), fig_to_image(shap_plot), fig_to_image(contribution_plot)
# Create Gradio interface
css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.interpretation-container {
    margin-top: 20px;
    padding: 15px;
    border-radius: 8px;
    background-color: #f8f9fa;
}
"""

with gr.Blocks(css=css) as iface:
    gr.Markdown("""
    # Virus Host Classifier
    This tool predicts whether a viral sequence is likely of human or non-human origin using k-mer frequency analysis.
    ### Instructions
    1. Upload a FASTA file or paste your sequence in FASTA format
    2. Adjust the number of top k-mers to display (default: 10)
    3. View the prediction results and feature importance visualizations
    """)
    with gr.Row():
        with gr.Column(scale=1):
            # type="binary" hands predict() the raw bytes, which it decodes
            # as UTF-8; without it Gradio passes a path / temp-file object.
            file_input = gr.File(
                label="Upload FASTA file",
                file_types=[".fasta", ".fa", ".txt"],
                type="binary"
            )
            text_input = gr.Textbox(
                label="Or paste FASTA sequence",
                placeholder=">sequence_name\nACGTACGT...",
                lines=5
            )
            top_k = gr.Slider(
                minimum=5,
                maximum=20,
                value=10,
                step=1,
                label="Number of top k-mers to display"
            )
            submit_btn = gr.Button("Analyze Sequence", variant="primary")
        with gr.Column(scale=2):
            results = gr.Textbox(label="Analysis Results", lines=10)
            shap_plot = gr.Image(label="Feature Importance Plot")
            contribution_plot = gr.Image(label="Cumulative Contribution Plot")
    # Input order must match predict(file_obj, top_kmers, fasta_text)
    submit_btn.click(
        predict,
        inputs=[file_input, top_k, text_input],
        outputs=[results, shap_plot, contribution_plot]
    )
    gr.Markdown("""
    ### About
    - Uses 4-mer frequencies as sequence features
    - Employs SHAP-like values for feature importance interpretation
    - Visualizes cumulative feature contributions to the final prediction
    """)

if __name__ == "__main__":
    iface.launch()