Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Feb 27

Commit

345980e

verified ·

1 Parent(s): a2f7e81

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -102

app.py CHANGED Viewed

@@ -18,6 +18,8 @@ import tempfile
 import os
 from typing import List, Dict, Tuple, Optional, Any
 import seaborn as sns
 ###############################################################################
 # 1. MODEL DEFINITION
@@ -80,101 +82,55 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     return vec
 ###############################################################################
-# 3. SHAP-VALUE CALCULATION
 ###############################################################################
-import shap
-from sklearn.linear_model import Ridge
 def calculate_shap_values(model, x_tensor):
-    """
-    Calculate SHAP values with three possible methods:
-    1. Try SHAP's GradientExplainer (better for deep models with unsupported layers)
-    2. Fall back to SHAP's KernelExplainer with fixed parameters if #1 fails
-    3. Fall back to original feature ablation method if both SHAP methods fail
-    """
     model.eval()
     device = next(model.parameters()).device
-    # Get human probability for baseline
-    with torch.no_grad():
-        output = model(x_tensor)
-        probs = torch.softmax(output, dim=1)
-        prob_human = probs[0, 1].item()
-    # Try GradientExplainer first (better for neural nets with unsupported ops)
     try:
-        # Create synthetic background data (more samples to avoid errors)
-        background = torch.zeros((20, x_tensor.shape[1]), device=device)
-        for i in range(20):
-            # Add small random noise to avoid singular matrices
-            background[i] = torch.randn_like(x_tensor[0]) * 0.01
-        explainer = shap.GradientExplainer(model, background)
         shap_values_all = explainer.shap_values(x_tensor)
-        # For classification, shap_values is a list of arrays, one for each class
-        # We want the values for the "human" class (index 1)
-        if isinstance(shap_values_all, list) and len(shap_values_all) > 1:
-            shap_values = shap_values_all[1][0].cpu().numpy()
-        else:
-            shap_values = shap_values_all[0].cpu().numpy()
-        print("Using GradientExplainer for SHAP values")
-        return np.array(shap_values), prob_human
     except Exception as e:
-        print(f"GradientExplainer failed: {str(e)}, trying KernelExplainer")
-        try:
-            # Create model wrapper function
-            def model_predict(x):
-                with torch.no_grad():
-                    tensor_x = torch.FloatTensor(x).to(device)
-                    output = model(tensor_x)
-                    probs = torch.softmax(output, dim=1)[:, 1]  # Human probability
-                    return probs.cpu().numpy()
-            # Create more background samples (50 samples with random noise)
-            background = np.zeros((50, x_tensor.shape[1]))
-            for i in range(50):
-                # Small random values to create better background distribution
-                background[i] = np.random.normal(0, 0.01, x_tensor.shape[1])
-            # Force using Ridge regression instead of default LassoLarsIC
-            explainer = shap.KernelExplainer(
-                model_predict,
-                background,
-                link="identity",  # Use raw output, not logit
-                l1_reg="num_features(10)",  # Simplified regularization
-                model_regressor=Ridge(alpha=0.01)  # Use Ridge instead of LassoLarsIC
-            )
-            # Calculate SHAP values with more samples
-            x_numpy = x_tensor.cpu().numpy()
-            shap_values = explainer.shap_values(x_numpy, nsamples=300)
-            print("Using KernelExplainer for SHAP values")
-            return np.array(shap_values), prob_human
-        except Exception as e:
-            print(f"KernelExplainer failed: {str(e)}, falling back to ablation method")
-            # Fall back to original feature ablation method
             with torch.no_grad():
-                shap_values = []
-                x_zeroed = x_tensor.clone()
-                for i in range(x_tensor.shape[1]):
-                    original_val = x_zeroed[0, i].item()
-                    x_zeroed[0, i] = 0.0
-                    output = model(x_zeroed)
-                    probs = torch.softmax(output, dim=1)
-                    prob = probs[0, 1].item()
-                    shap_values.append(prob_human - prob)
-                    x_zeroed[0, i] = original_val
-            print("Using ablation method for SHAP values")
-            return np.array(shap_values), prob_human
 ###############################################################################
 # 4. PER-BASE SHAP AGGREGATION
 ###############################################################################
@@ -1061,28 +1017,10 @@ def prepare_csv_download(data, filename="analysis_results.csv"):
     else:
         raise ValueError("Unsupported data type for CSV download")
-###############################################################################
-# 13. EXAMPLE FASTA LOADER
-###############################################################################
-def load_example_fasta():
-    """Load the example.fasta file contents"""
-    try:
-        with open('example.fasta', 'r') as f:
-            example_text = f.read()
-        return example_text
-    except Exception as e:
-        return f">example_sequence\nACGTACGT...\n\n(Note: Could not load example.fasta: {str(e)})"
 ###############################################################################
 # 14. BUILD GRADIO INTERFACE
 ###############################################################################
-###############################################################################
-# 13. EXAMPLE FASTA LOADER
-###############################################################################
 def load_example_fasta():
     """Load the example.fasta file contents"""
     try:
@@ -1184,10 +1122,10 @@ with gr.Blocks(css=css) as iface:
         **Analyze Gene Features**
         Upload a FASTA file and corresponding gene features file to analyze SHAP values per gene.
         Gene features should be in the format:
-        ```
-        >gene_name [gene=X] [locus_tag=Y] [location=start..end] or [location=complement(start..end)]
         SEQUENCE
-        ```
         The genome viewer will show genes color-coded by their contribution:
         - Red: Genes pushing toward human origin
         - Blue: Genes pushing toward non-human origin

 import os
 from typing import List, Dict, Tuple, Optional, Any
 import seaborn as sns
+import shap
 ###############################################################################
 # 1. MODEL DEFINITION
     return vec
 ###############################################################################
+# 3. SHAP-VALUE CALCULATION (DeepExplainer with KernelExplainer fallback)
 ###############################################################################
 def calculate_shap_values(model, x_tensor):
     """Compute per-feature SHAP values for class index 1 and its probability.

     The primary path uses ``shap.DeepExplainer`` against an all-zero
     baseline; if that raises for any reason, ``shap.KernelExplainer`` is
     used instead with the same baseline.

     NOTE(review): the two paths do not explain the same quantity.
     DeepExplainer is handed ``model`` directly (its raw output), while the
     KernelExplainer wrapper explains the softmax probability of class 1.
     Confirm whether downstream code depends on the scale.

     Args:
         model: A torch module; ``x_tensor`` is fed to it directly.
         x_tensor: Input tensor indexed as ``[sample, feature]``. Only the
             first sample is explained and reported.

     Returns:
         A ``(shap_values, prob_human)`` tuple: a 1-D NumPy array with one
         value per input feature, and the softmax probability of class
         index 1 for the first sample as a Python float.
     """
     model.eval()
     device = next(model.parameters()).device
     n_features = x_tensor.shape[1]
     try:
         # DeepExplainer on a PyTorch model needs a tensor background on the
         # model's device; a NumPy array makes this path fail every time.
         # One zero row is enough: repeating identical rows only multiplies
         # the cost without changing the averaged baseline.
         background = torch.zeros(
             (1, n_features), dtype=x_tensor.dtype, device=device
         )
         explainer = shap.DeepExplainer(model, background)
         shap_values_all = explainer.shap_values(x_tensor)
         if isinstance(shap_values_all, list):
             # Older SHAP: one (samples, features) array per output class.
             shap_values = shap_values_all[1][0]
         else:
             # Newer SHAP (TODO confirm version): a single array laid out
             # as (samples, features, classes).
             stacked = np.asarray(shap_values_all)
             if stacked.ndim != 3:
                 raise ValueError(
                     f"Unexpected SHAP output shape: {stacked.shape}"
                 )
             shap_values = stacked[0, :, 1]
     except Exception as e:
         # Deliberate best-effort: any failure above (including the shape
         # check) switches to the slower model-agnostic explainer.
         print(f"DeepExplainer failed, falling back to KernelExplainer: {str(e)}")
         def model_predict(x):
             # KernelExplainer passes NumPy batches; return P(class 1).
             with torch.no_grad():
                 tensor_x = torch.FloatTensor(x).to(device)
                 output = model(tensor_x)
                 probs = torch.softmax(output, dim=1)[:, 1]  # Human probability
                 return probs.cpu().numpy()
         background = np.zeros((1, n_features))
         explainer = shap.KernelExplainer(model_predict, background)
         x_numpy = x_tensor.cpu().numpy()
         # The result carries a leading sample axis; keep only the first
         # sample so both paths return the same 1-D shape.
         shap_values = np.asarray(
             explainer.shap_values(x_numpy, nsamples=100)
         )[0]
     # Probability of class index 1 for the first sample.
     with torch.no_grad():
         output = model(x_tensor)
         probs = torch.softmax(output, dim=1)
         prob_human = probs[0, 1].item()
     # Flatten so callers always receive one value per feature.
     return np.array(shap_values).reshape(-1), prob_human
 ###############################################################################
 # 4. PER-BASE SHAP AGGREGATION
 ###############################################################################
     else:
         raise ValueError("Unsupported data type for CSV download")
 ###############################################################################
 # 14. BUILD GRADIO INTERFACE
 ###############################################################################
 def load_example_fasta():
     """Load the example.fasta file contents"""
     try:
         **Analyze Gene Features**
         Upload a FASTA file and corresponding gene features file to analyze SHAP values per gene.
         Gene features should be in the format:
+>gene_name [gene=X] [locus_tag=Y] [location=start..end] or [location=complement(start..end)]
         SEQUENCE
         The genome viewer will show genes color-coded by their contribution:
         - Red: Genes pushing toward human origin
         - Blue: Genes pushing toward non-human origin