Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Jan 11

Commit

4a7c026

verified ·

1 Parent(s): 2897f12

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -57

app.py CHANGED Viewed

@@ -4,6 +4,9 @@ import joblib
 import numpy as np
 from itertools import product
 import torch.nn as nn
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
@@ -83,7 +86,7 @@ def parse_fasta(text):
 def predict(file_obj):
     if file_obj is None:
-        return "Please upload a FASTA file"
     # Read the file content
     try:
@@ -92,7 +95,7 @@ def predict(file_obj):
         else:
             text = file_obj.decode('utf-8')
     except Exception as e:
-        return f"Error reading file: {str(e)}"
     # Generate k-mer dictionary
     k = 4  # k-mer size
@@ -114,78 +117,108 @@ def predict(file_obj):
         # Set model to evaluation mode
         model.eval()
     except Exception as e:
-        return f"Error loading model: {str(e)}\nFull traceback: {str(e.__traceback__)}"
-    # Get predictions
-    results = []
     try:
         sequences = parse_fasta(text)
-        for header, seq in sequences:
-            # Get raw frequency vector and scaled vector
-            raw_freq_vector = sequence_to_kmer_vector(seq)
-            kmer_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
-            X_tensor = torch.FloatTensor(kmer_vector).to(device)
-            # Get predictions and feature importance
-            with torch.no_grad():
-                output = model(X_tensor)
-                probs = torch.softmax(output, dim=1)
-            # Calculate feature importance
-            importance = model.get_feature_importance(X_tensor)
-            kmer_importance = importance[0].cpu().numpy()
-            # Normalize importance scores to original scale
             kmer_importance = kmer_importance / np.max(np.abs(kmer_importance)) * 0.002
-            # Get top 10 k-mers based on absolute importance
-            top_k = 10
-            top_indices = np.argsort(np.abs(kmer_importance))[-top_k:][::-1]
-            important_kmers = [
-                {
-                    'kmer': list(kmer_dict.keys())[list(kmer_dict.values()).index(i)],
-                    'importance': float(kmer_importance[i]),
-                    'frequency': float(raw_freq_vector[i]),
-                    'scaled': float(kmer_vector[0][i])
-                }
-                for i in top_indices
-            ]
-            # Format results
-            pred_class = 1 if probs[0][1] > probs[0][0] else 0
-            pred_label = 'human' if pred_class == 1 else 'non-human'
-            result = f"""Sequence: {header}
 Prediction: {pred_label}
 Confidence: {float(max(probs[0])):0.4f}
 Human probability: {float(probs[0][1]):0.4f}
 Non-human probability: {float(probs[0][0]):0.4f}
 Most influential k-mers (ranked by importance):"""
-            for kmer in important_kmers:
-                result += f"\n  {kmer['kmer']}: "
-                result += f"impact={kmer['importance']:.4f}, "
-                result += f"occurrence={kmer['frequency']*100:.2f}% of sequence "
-                if kmer['scaled'] > 0:
-                    result += f"(appears {abs(kmer['scaled']):.2f}σ more than average)"
-                else:
-                    result += f"(appears {abs(kmer['scaled']):.2f}σ less than average)"
-            results.append(result)
     except Exception as e:
-        return f"Error processing sequences: {str(e)}"
-    return "\n\n".join(results)
-# Create the interface
 iface = gr.Interface(
     fn=predict,
     inputs=gr.File(label="Upload FASTA file", type="binary"),
-    outputs=gr.Textbox(label="Results"),
     title="Virus Host Classifier"
 )
 # Launch the interface
 if __name__ == "__main__":
-    iface.launch()

 import numpy as np
 from itertools import product
 import torch.nn as nn
+import shap
+import matplotlib.pyplot as plt
+import io
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
 def predict(file_obj):
     if file_obj is None:
+        return "Please upload a FASTA file", None
     # Read the file content
     try:
         else:
             text = file_obj.decode('utf-8')
     except Exception as e:
+        return f"Error reading file: {str(e)}", None
     # Generate k-mer dictionary
     k = 4  # k-mer size
         # Set model to evaluation mode
         model.eval()
     except Exception as e:
+        return f"Error loading model: {str(e)}", None
+    # Initialize variables to store results and plot
+    results_text = ""
+    plot_image = None
     try:
         sequences = parse_fasta(text)
+        # For simplicity, process only the first sequence for plotting
+        header, seq = sequences[0]
+        # Get raw frequency vector and scaled vector
+        raw_freq_vector = sequence_to_kmer_vector(seq)
+        kmer_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
+        X_tensor = torch.FloatTensor(kmer_vector).to(device)
+        # Get predictions and feature importance
+        with torch.no_grad():
+            output = model(X_tensor)
+            probs = torch.softmax(output, dim=1)
+        importance = model.get_feature_importance(X_tensor)
+        kmer_importance = importance[0].cpu().numpy()
+        # Normalize importance scores to original scale
+        if np.max(np.abs(kmer_importance)) != 0:
             kmer_importance = kmer_importance / np.max(np.abs(kmer_importance)) * 0.002
+        # Get top 10 k-mers based on absolute importance
+        top_k = 10
+        top_indices = np.argsort(np.abs(kmer_importance))[-top_k:][::-1]
+        important_kmers = [
+            {
+                'kmer': list(kmer_dict.keys())[list(kmer_dict.values()).index(i)],
+                'importance': float(kmer_importance[i]),
+                'frequency': float(raw_freq_vector[i]),
+                'scaled': float(kmer_vector[0][i])
+            }
+            for i in top_indices
+        ]
+        # Prepare SHAP-like values for waterfall plot
+        top_features = [item['kmer'] for item in important_kmers]
+        top_values = [item['importance'] for item in important_kmers]
+        # Combine the rest of the features into an "Others" category
+        others_mask = np.ones_like(kmer_importance, dtype=bool)
+        others_mask[top_indices] = False
+        others_sum = np.sum(kmer_importance[others_mask])
+        top_features.append("Others")
+        top_values.append(others_sum)
+        explanation = shap.Explanation(
+            values=np.array(top_values),
+            base_values=0,
+            data=np.array([raw_freq_vector[kmer_dict[feat]] if feat != "Others" else np.sum(raw_freq_vector[others_mask]) for feat in top_features]),
+            feature_names=top_features
+        )
+        # Generate waterfall plot using SHAP's legacy function
+        fig = shap.plots._waterfall.waterfall_legacy(explanation, show=False)
+        # Save plot to a bytes buffer
+        buf = io.BytesIO()
+        fig.savefig(buf, format='png')
+        buf.seek(0)
+        plot_image = buf
+        # Format textual results for the first sequence
+        pred_class = 1 if probs[0][1] > probs[0][0] else 0
+        pred_label = 'human' if pred_class == 1 else 'non-human'
+        results_text += f"""Sequence: {header}
 Prediction: {pred_label}
 Confidence: {float(max(probs[0])):0.4f}
 Human probability: {float(probs[0][1]):0.4f}
 Non-human probability: {float(probs[0][0]):0.4f}
 Most influential k-mers (ranked by importance):"""
+        for kmer in important_kmers:
+            results_text += f"\n  {kmer['kmer']}: "
+            results_text += f"impact={kmer['importance']:.4f}, "
+            results_text += f"occurrence={kmer['frequency']*100:.2f}% of sequence "
+            if kmer['scaled'] > 0:
+                results_text += f"(appears {abs(kmer['scaled']):.2f}σ more than average)"
+            else:
+                results_text += f"(appears {abs(kmer['scaled']):.2f}σ less than average)"
     except Exception as e:
+        return f"Error processing sequences: {str(e)}", None
+    return results_text, plot_image
+# Create the interface with two outputs: Textbox and Image
 iface = gr.Interface(
     fn=predict,
     inputs=gr.File(label="Upload FASTA file", type="binary"),
+    outputs=[gr.Textbox(label="Results"), gr.Image(label="SHAP Waterfall Plot")],
     title="Virus Host Classifier"
 )
 # Launch the interface
 if __name__ == "__main__":
+    iface.launch()