Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 11

Commit

6c88c65

verified ·

1 Parent(s): b0fba50

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -153

app.py CHANGED Viewed

@@ -4,10 +4,8 @@ import joblib
 import numpy as np
 from itertools import product
 import torch.nn as nn
-import shap
 import matplotlib.pyplot as plt
 import io
-import json
 from PIL import Image
 class VirusClassifier(nn.Module):
@@ -31,16 +29,16 @@ class VirusClassifier(nn.Module):
         return self.network(x)
     def get_feature_importance(self, x):
-        """Calculate feature importance using gradient-based method for the human class (index 1)"""
         x.requires_grad_(True)
         output = self.network(x)
         probs = torch.softmax(output, dim=1)
-        # We focus on the human class (index 1) probability
         human_prob = probs[..., 1]
         human_prob.backward()
-        # The gradient shows how each feature affects the human probability
         importance = x.grad
         return importance, float(human_prob)
@@ -82,6 +80,94 @@ def parse_fasta(text):
         sequences.append((current_header, ''.join(current_sequence)))
     return sequences
 def predict(file_obj):
     if file_obj is None:
         return "Please upload a FASTA file", None
@@ -119,172 +205,64 @@ def predict(file_obj):
         kmer_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
         X_tensor = torch.FloatTensor(kmer_vector).to(device)
-        # Calculate final probabilities first
         with torch.no_grad():
             output = model(X_tensor)
             probs = torch.softmax(output, dim=1)
-            human_prob = float(probs[0][1])
-        # Get feature importance using integrated gradients
-        baseline = torch.zeros_like(X_tensor)  # baseline of zeros
-        steps = 50
-        all_importance = []
-        for i in range(steps + 1):
-            alpha = i / steps
-            interpolated = baseline + alpha * (X_tensor - baseline)
-            interpolated.requires_grad_(True)
-            output = model(interpolated)
-            probs = torch.softmax(output, dim=1)
-            human_class = probs[..., 1]
-            if interpolated.grad is not None:
-                interpolated.grad.zero_()
-            human_class.backward()
-            all_importance.append(interpolated.grad.cpu().numpy())
-        # Average the gradients
-        kmer_importance = np.mean(all_importance, axis=0)[0]
-        # Scale to match probability difference
-        target_diff = human_prob - 0.5  # difference from neutral prediction
-        current_sum = np.sum(kmer_importance)
-        if current_sum != 0:  # avoid division by zero
-            kmer_importance = kmer_importance * (target_diff / current_sum)
-        # Get top k-mers by absolute importance
         top_k = 10
         top_indices = np.argsort(np.abs(kmer_importance))[-top_k:][::-1]
-        important_kmers = [
-            {
-                'kmer': list(kmer_dict.keys())[list(kmer_dict.values()).index(i)],
-                'importance': float(kmer_importance[i]),
-                'frequency': float(raw_freq_vector[i]),
-                'scaled': float(kmer_vector[0][i])
-            }
-            for i in top_indices
-        ]
-        # Prepare data for SHAP waterfall plot
-        top_features = [item['kmer'] for item in important_kmers]
-        top_values = [item['importance'] for item in important_kmers]
-        # Calculate the impact of remaining features
-        others_mask = np.ones_like(kmer_importance, dtype=bool)
-        others_mask[top_indices] = False
-        others_sum = np.sum(kmer_importance[others_mask])
-        top_features.append("Others")
-        top_values.append(others_sum)
-        # Calculate final probabilities first
-        with torch.no_grad():
-            output = model(X_tensor)
-            probs = torch.softmax(output, dim=1)
-            human_prob = float(probs[0][1])
-        # Create SHAP explanation
-        # We'll use the actual probabilities for alignment
-        explanation = shap.Explanation(
-            values=np.array(top_values),
-            base_values=0.5,  # Start from neutral prediction
-            data=np.array([
-                raw_freq_vector[kmer_dict[feat]] if feat != "Others"
-                else np.sum(raw_freq_vector[others_mask])
-                for feat in top_features
-            ]),
-            feature_names=top_features
-        )
-        explanation.expected_value = 0.5  # Start from neutral prediction
-        # Calculate step-by-step probabilities
-        current_prob = 0.5  # Start at neutral
-        steps = [('Start', current_prob, 0)]
-        # Process each k-mer contribution
-        for kmer in important_kmers:
-            change = kmer['importance']
-            current_prob += change
-            steps.append((kmer['kmer'], current_prob, change))
-        # Add final "Others" contribution
-        current_prob += others_sum
-        steps.append(('Others', current_prob, others_sum))
-        # Create step plot
-        plt.figure(figsize=(12, 6))
-        x = range(len(steps))
-        y = [step[1] for step in steps]
-        # Plot steps
-        plt.step(x, y, 'b-', where='post', label='Probability', linewidth=2)
-        plt.plot(x, y, 'b.', markersize=10)
-        # Add reference line
-        plt.axhline(y=0.5, color='r', linestyle='--', label='Neutral (0.5)')
-        # Customize plot
-        plt.grid(True, linestyle='--', alpha=0.7)
-        plt.ylim(0, 1)
-        plt.ylabel('Human Probability')
-        plt.title(f'K-mer Contributions to Prediction (final prob: {human_prob:.3f})')
-        # Add labels for each point
-        for i, (kmer, prob, change) in enumerate(steps):
-            # Add k-mer label
-            plt.annotate(kmer,
-                        (i, prob),
-                        xytext=(0, 10 if i % 2 == 0 else -20),  # Alternate up/down
-                        textcoords='offset points',
-                        ha='center',
-                        rotation=45 if len(kmer) > 5 else 0)
-            # Add change value
-            if i > 0:  # Skip first point (Start)
-                change_text = f'{change:+.3f}'
-                color = 'green' if change > 0 else 'red'
-                plt.annotate(change_text,
-                           (i, prob),
-                           xytext=(0, -20 if i % 2 == 0 else 10),
-                           textcoords='offset points',
-                           ha='center',
-                           color=color)
-        plt.legend()
-        plt.tight_layout()
-        # Save plot
-        buf = io.BytesIO()
-        plt.savefig(buf, format='png', bbox_inches='tight', dpi=300)
-        buf.seek(0)
-        plot_image = Image.open(buf)
-        plt.close()
-        # Calculate final probabilities
-        with torch.no_grad():
-            output = model(X_tensor)
-            probs = torch.softmax(output, dim=1)
         pred_class = 1 if probs[0][1] > probs[0][0] else 0
         pred_label = 'human' if pred_class == 1 else 'non-human'
-        # Generate results text
-        results_text += f"""Sequence: {header}
 Prediction: {pred_label}
 Confidence: {float(max(probs[0])):0.4f}
-Human probability: {float(probs[0][1]):0.4f}
 Non-human probability: {float(probs[0][0]):0.4f}
 Most influential k-mers (ranked by importance):"""
         for kmer in important_kmers:
-            direction = "human" if kmer['importance'] > 0 else "non-human"
             results_text += f"\n  {kmer['kmer']}: "
-            results_text += f"pushes toward {direction} (impact={abs(kmer['importance']):.4f}), "
-            results_text += f"occurrence={kmer['frequency']*100:.2f}% of sequence "
-            if kmer['scaled'] > 0:
-                results_text += f"(appears {abs(kmer['scaled']):.2f}σ more than average)"
-            else:
-                results_text += f"(appears {abs(kmer['scaled']):.2f}σ less than average)"
     except Exception as e:
         return f"Error processing sequences: {str(e)}", None
@@ -294,7 +272,10 @@ Most influential k-mers (ranked by importance):"""
 iface = gr.Interface(
     fn=predict,
     inputs=gr.File(label="Upload FASTA file", type="binary"),
-    outputs=[gr.Textbox(label="Results"), gr.Image(label="SHAP Waterfall Plot")],
     title="Virus Host Classifier"
 )

 import numpy as np
 from itertools import product
 import torch.nn as nn
 import matplotlib.pyplot as plt
 import io
 from PIL import Image
 class VirusClassifier(nn.Module):
         return self.network(x)
     def get_feature_importance(self, x):
+        """Calculate feature importance using gradient-based method"""
         x.requires_grad_(True)
         output = self.network(x)
         probs = torch.softmax(output, dim=1)
+        # Get importance for human class (index 1)
         human_prob = probs[..., 1]
+        if x.grad is not None:
+            x.grad.zero_()
         human_prob.backward()
         importance = x.grad
         return importance, float(human_prob)
         sequences.append((current_header, ''.join(current_sequence)))
     return sequences
+def create_visualization(important_kmers, human_prob, title):
+    """Create a comprehensive visualization of k-mer impacts"""
+    fig = plt.figure(figsize=(15, 10))
+    # Create grid for subplots
+    gs = plt.GridSpec(2, 1, height_ratios=[1.5, 1], hspace=0.3)
+    # 1. Probability Step Plot
+    ax1 = plt.subplot(gs[0])
+    current_prob = 0.5
+    steps = [('Start', current_prob, 0)]
+    for kmer in important_kmers:
+        change = kmer['impact'] * (-1 if kmer['direction'] == 'non-human' else 1)
+        current_prob += change
+        steps.append((kmer['kmer'], current_prob, change))
+    x = range(len(steps))
+    y = [step[1] for step in steps]
+    # Plot steps
+    ax1.step(x, y, 'b-', where='post', label='Probability', linewidth=2)
+    ax1.plot(x, y, 'b.', markersize=10)
+    # Add reference line
+    ax1.axhline(y=0.5, color='r', linestyle='--', label='Neutral (0.5)')
+    # Customize plot
+    ax1.grid(True, linestyle='--', alpha=0.7)
+    ax1.set_ylim(0, 1)
+    ax1.set_ylabel('Human Probability')
+    ax1.set_title(f'K-mer Contributions to Prediction (final prob: {human_prob:.3f})')
+    # Add labels for each point
+    for i, (kmer, prob, change) in enumerate(steps):
+        # Add k-mer label
+        ax1.annotate(kmer,
+                    (i, prob),
+                    xytext=(0, 10 if i % 2 == 0 else -20),
+                    textcoords='offset points',
+                    ha='center',
+                    rotation=45)
+        # Add change value
+        if i > 0:
+            change_text = f'{change:+.3f}'
+            color = 'green' if change > 0 else 'red'
+            ax1.annotate(change_text,
+                       (i, prob),
+                       xytext=(0, -20 if i % 2 == 0 else 10),
+                       textcoords='offset points',
+                       ha='center',
+                       color=color)
+    ax1.legend()
+    # 2. K-mer Frequency and Sigma Plot
+    ax2 = plt.subplot(gs[1])
+    # Prepare data
+    kmers = [k['kmer'] for k in important_kmers]
+    frequencies = [k['occurrence'] for k in important_kmers]
+    sigmas = [k['sigma'] for k in important_kmers]
+    colors = ['g' if k['direction'] == 'human' else 'r' for k in important_kmers]
+    # Create bar plot for frequencies
+    x = np.arange(len(kmers))
+    width = 0.35
+    ax2.bar(x - width/2, frequencies, width, label='Frequency (%)', color=colors, alpha=0.6)
+    ax2_twin = ax2.twinx()
+    ax2_twin.bar(x + width/2, sigmas, width, label='σ from mean', color=[c if s > 0 else 'gray' for c, s in zip(colors, sigmas)], alpha=0.3)
+    # Customize plot
+    ax2.set_xticks(x)
+    ax2.set_xticklabels(kmers, rotation=45)
+    ax2.set_ylabel('Frequency (%)')
+    ax2_twin.set_ylabel('Standard Deviations (σ) from Mean')
+    ax2.set_title('K-mer Frequencies and Statistical Significance')
+    # Add legends
+    lines1, labels1 = ax2.get_legend_handles_labels()
+    lines2, labels2 = ax2_twin.get_legend_handles_labels()
+    ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')
+    plt.tight_layout()
+    return fig
 def predict(file_obj):
     if file_obj is None:
         return "Please upload a FASTA file", None
         kmer_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
         X_tensor = torch.FloatTensor(kmer_vector).to(device)
+        # Get model predictions
         with torch.no_grad():
             output = model(X_tensor)
             probs = torch.softmax(output, dim=1)
+        # Get feature importance
+        importance, _ = model.get_feature_importance(X_tensor)
+        kmer_importance = importance[0].cpu().numpy()
+        # Get top k-mers
         top_k = 10
         top_indices = np.argsort(np.abs(kmer_importance))[-top_k:][::-1]
+        important_kmers = []
+        for idx in top_indices:
+            kmer = list(kmer_dict.keys())[list(kmer_dict.values()).index(idx)]
+            imp = float(abs(kmer_importance[idx]))
+            direction = 'human' if kmer_importance[idx] > 0 else 'non-human'
+            freq = float(raw_freq_vector[idx] * 100)  # Convert to percentage
+            sigma = float(kmer_vector[0][idx])
+            important_kmers.append({
+                'kmer': kmer,
+                'impact': imp,
+                'direction': direction,
+                'occurrence': freq,
+                'sigma': sigma
+            })
+        # Generate text results
         pred_class = 1 if probs[0][1] > probs[0][0] else 0
         pred_label = 'human' if pred_class == 1 else 'non-human'
+        human_prob = float(probs[0][1])
+        results_text = f"""Sequence: {header}
 Prediction: {pred_label}
 Confidence: {float(max(probs[0])):0.4f}
+Human probability: {human_prob:0.4f}
 Non-human probability: {float(probs[0][0]):0.4f}
 Most influential k-mers (ranked by importance):"""
         for kmer in important_kmers:
             results_text += f"\n  {kmer['kmer']}: "
+            results_text += f"pushes toward {kmer['direction']} (impact={kmer['impact']:.4f}), "
+            results_text += f"occurrence={kmer['occurrence']:.2f}% of sequence "
+            results_text += f"(appears {abs(kmer['sigma']):.2f}σ "
+            results_text += "more" if kmer['sigma'] > 0 else "less"
+            results_text += " than average)"
+        # Create visualization
+        fig = create_visualization(important_kmers, human_prob, header)
+        # Save plot
+        buf = io.BytesIO()
+        fig.savefig(buf, format='png', bbox_inches='tight', dpi=300)
+        buf.seek(0)
+        plot_image = Image.open(buf)
+        plt.close(fig)
     except Exception as e:
         return f"Error processing sequences: {str(e)}", None
 iface = gr.Interface(
     fn=predict,
     inputs=gr.File(label="Upload FASTA file", type="binary"),
+    outputs=[
+        gr.Textbox(label="Results"),
+        gr.Image(label="K-mer Analysis Visualization")
+    ],
     title="Virus Host Classifier"
 )