Spaces:

hiyata
/

HostClassifier

Running

App Files Community

hiyata committed on Jan 11

Commit

6a3b036

verified ·

1 Parent(s): 0eb9745

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -32

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import torch.nn as nn
 import shap
 import matplotlib.pyplot as plt
 import io
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
@@ -44,20 +45,15 @@ class VirusClassifier(nn.Module):
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """Convert sequence to k-mer frequency vector"""
-    # Generate all possible k-mers
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
-    # Initialize vector
     vec = np.zeros(len(kmers), dtype=np.float32)
-    # Count k-mers
     for i in range(len(sequence) - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
             vec[kmer_dict[kmer]] += 1
-    # Convert to frequencies
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
         vec = vec / total_kmers
@@ -88,7 +84,6 @@ def predict(file_obj):
     if file_obj is None:
         return "Please upload a FASTA file", None
-    # Read the file content
     try:
         if isinstance(file_obj, str):
             text = file_obj
@@ -97,43 +92,31 @@ def predict(file_obj):
     except Exception as e:
         return f"Error reading file: {str(e)}", None
-    # Generate k-mer dictionary
-    k = 4  # k-mer size
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
-    # Load model and scaler
     try:
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        model = VirusClassifier(256).to(device)  # k=4 -> 4^4 = 256 features
-        # Load model with explicit map_location
         state_dict = torch.load('model.pt', map_location=device)
         model.load_state_dict(state_dict)
-        # Load scaler
         scaler = joblib.load('scaler.pkl')
-        # Set model to evaluation mode
         model.eval()
     except Exception as e:
         return f"Error loading model: {str(e)}", None
-    # Initialize variables to store results and plot
     results_text = ""
     plot_image = None
     try:
         sequences = parse_fasta(text)
-        # For simplicity, process only the first sequence for plotting
         header, seq = sequences[0]
-        # Get raw frequency vector and scaled vector
         raw_freq_vector = sequence_to_kmer_vector(seq)
         kmer_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
         X_tensor = torch.FloatTensor(kmer_vector).to(device)
-        # Get predictions and feature importance
         with torch.no_grad():
             output = model(X_tensor)
             probs = torch.softmax(output, dim=1)
@@ -141,11 +124,9 @@ def predict(file_obj):
         importance = model.get_feature_importance(X_tensor)
         kmer_importance = importance[0].cpu().numpy()
-        # Normalize importance scores to original scale
         if np.max(np.abs(kmer_importance)) != 0:
             kmer_importance = kmer_importance / np.max(np.abs(kmer_importance)) * 0.002
-        # Get top 10 k-mers based on absolute importance
         top_k = 10
         top_indices = np.argsort(np.abs(kmer_importance))[-top_k:][::-1]
         important_kmers = [
@@ -158,11 +139,9 @@ def predict(file_obj):
             for i in top_indices
         ]
-        # Prepare SHAP-like values for waterfall plot
         top_features = [item['kmer'] for item in important_kmers]
         top_values = [item['importance'] for item in important_kmers]
-        # Combine the rest of the features into an "Others" category
         others_mask = np.ones_like(kmer_importance, dtype=bool)
         others_mask[top_indices] = False
         others_sum = np.sum(kmer_importance[others_mask])
@@ -176,19 +155,15 @@ def predict(file_obj):
             data=np.array([raw_freq_vector[kmer_dict[feat]] if feat != "Others" else np.sum(raw_freq_vector[others_mask]) for feat in top_features]),
             feature_names=top_features
         )
-        # Manually set expected_value to satisfy waterfall_legacy requirements
         explanation.expected_value = 0
-        # Generate waterfall plot using SHAP's legacy function
         fig = shap.plots._waterfall.waterfall_legacy(explanation, show=False)
-        # Save plot to a bytes buffer
         buf = io.BytesIO()
         fig.savefig(buf, format='png')
         buf.seek(0)
-        plot_image = buf
-        # Format textual results for the first sequence
         pred_class = 1 if probs[0][1] > probs[0][0] else 0
         pred_label = 'human' if pred_class == 1 else 'non-human'
@@ -213,7 +188,6 @@ Most influential k-mers (ranked by importance):"""
     return results_text, plot_image
-# Create the interface with two outputs: Textbox and Image
 iface = gr.Interface(
     fn=predict,
     inputs=gr.File(label="Upload FASTA file", type="binary"),
@@ -221,6 +195,5 @@ iface = gr.Interface(
     title="Virus Host Classifier"
 )
-# Launch the interface
 if __name__ == "__main__":
-    iface.launch()

 import shap
 import matplotlib.pyplot as plt
 import io
+from PIL import Image  # Import PIL for image handling
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """Convert sequence to k-mer frequency vector"""
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
     for i in range(len(sequence) - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
             vec[kmer_dict[kmer]] += 1
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
         vec = vec / total_kmers
     if file_obj is None:
         return "Please upload a FASTA file", None
     try:
         if isinstance(file_obj, str):
             text = file_obj
     except Exception as e:
         return f"Error reading file: {str(e)}", None
+    k = 4
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     try:
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        model = VirusClassifier(256).to(device)
         state_dict = torch.load('model.pt', map_location=device)
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
         model.eval()
     except Exception as e:
         return f"Error loading model: {str(e)}", None
     results_text = ""
     plot_image = None
     try:
         sequences = parse_fasta(text)
         header, seq = sequences[0]
         raw_freq_vector = sequence_to_kmer_vector(seq)
         kmer_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
         X_tensor = torch.FloatTensor(kmer_vector).to(device)
         with torch.no_grad():
             output = model(X_tensor)
             probs = torch.softmax(output, dim=1)
         importance = model.get_feature_importance(X_tensor)
         kmer_importance = importance[0].cpu().numpy()
         if np.max(np.abs(kmer_importance)) != 0:
             kmer_importance = kmer_importance / np.max(np.abs(kmer_importance)) * 0.002
         top_k = 10
         top_indices = np.argsort(np.abs(kmer_importance))[-top_k:][::-1]
         important_kmers = [
             for i in top_indices
         ]
         top_features = [item['kmer'] for item in important_kmers]
         top_values = [item['importance'] for item in important_kmers]
         others_mask = np.ones_like(kmer_importance, dtype=bool)
         others_mask[top_indices] = False
         others_sum = np.sum(kmer_importance[others_mask])
             data=np.array([raw_freq_vector[kmer_dict[feat]] if feat != "Others" else np.sum(raw_freq_vector[others_mask]) for feat in top_features]),
             feature_names=top_features
         )
         explanation.expected_value = 0
         fig = shap.plots._waterfall.waterfall_legacy(explanation, show=False)
         buf = io.BytesIO()
         fig.savefig(buf, format='png')
         buf.seek(0)
+        plot_image = Image.open(buf)  # Convert BytesIO to PIL Image
         pred_class = 1 if probs[0][1] > probs[0][0] else 0
         pred_label = 'human' if pred_class == 1 else 'non-human'
     return results_text, plot_image
 iface = gr.Interface(
     fn=predict,
     inputs=gr.File(label="Upload FASTA file", type="binary"),
     title="Virus Host Classifier"
 )
 if __name__ == "__main__":
+    iface.launch(share=True)  # Set share=True to create a public link