hiyata committed on
Commit
82425ee
·
verified ·
1 Parent(s): 05b9733

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -90
app.py CHANGED
@@ -9,7 +9,6 @@ import matplotlib.colors as mcolors
9
  import io
10
  from PIL import Image
11
  from scipy.interpolate import interp1d
12
- import numpy as np
13
 
14
  ###############################################################################
15
  # 1. MODEL DEFINITION
@@ -317,91 +316,90 @@ def compute_gc_content(sequence):
317
  return (gc_count / len(sequence)) * 100.0
318
 
319
  ###############################################################################
320
- # 7. MAIN ANALYSIS STEP (Gradio Step 1)
321
  ###############################################################################
322
 
323
- def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
324
  """
325
- Compare two sequences by analyzing their SHAP differences.
326
- Returns comparison text and visualizations.
327
  """
328
- # Process first sequence
329
- results1 = analyze_sequence(file1, fasta_text=fasta1)
330
- if isinstance(results1[0], str) and "Error" in results1[0]:
331
- return (f"Error in sequence 1: {results1[0]}", None, None)
332
-
333
- # Process second sequence
334
- results2 = analyze_sequence(file2, fasta_text=fasta2)
335
- if isinstance(results2[0], str) and "Error" in results2[0]:
336
- return (f"Error in sequence 2: {results2[0]}", None, None)
337
-
338
- # Get SHAP means from state dictionaries
339
- shap1 = results1[3]["shap_means"]
340
- shap2 = results2[3]["shap_means"]
341
-
342
- # Normalize lengths
343
- shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
344
-
345
- # Compute difference (positive = seq2 more human-like)
346
- shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
347
-
348
- # Calculate some statistics
349
- avg_diff = np.mean(shap_diff)
350
- std_diff = np.std(shap_diff)
351
- max_diff = np.max(shap_diff)
352
- min_diff = np.min(shap_diff)
353
-
354
- # Calculate what fraction of positions show substantial differences
355
- threshold = 0.05 # Arbitrary threshold for "substantial" difference
356
- substantial_diffs = np.abs(shap_diff) > threshold
357
- frac_different = np.mean(substantial_diffs)
358
-
359
- # Generate comparison text
360
- # Extract classifications without using split on newline
361
- classification1 = results1[0].split('Classification: ')[1].split('(')[0].strip()
362
- classification2 = results2[0].split('Classification: ')[1].split('(')[0].strip()
363
-
364
- # Build the text using format method
365
- comparison_text = (
366
- "Sequence Comparison Results:\n"
367
- "Sequence 1: {}\n"
368
- "Length: {:,} bases\n"
369
- "Classification: {}\n\n"
370
- "Sequence 2: {}\n"
371
- "Length: {:,} bases\n"
372
- "Classification: {}\n\n"
373
- "Comparison Statistics:\n"
374
- "Average SHAP difference: {:.4f}\n"
375
- "Standard deviation: {:.4f}\n"
376
- "Max difference: {:.4f} (Seq2 more human-like)\n"
377
- "Min difference: {:.4f} (Seq1 more human-like)\n"
378
- "Fraction of positions with substantial differences: {:.2%}\n\n"
379
- "Interpretation:\n"
380
- "Positive values (red) indicate regions where Sequence 2 is more human-like\n"
381
- "Negative values (blue) indicate regions where Sequence 1 is more human-like"
382
- ).format(
383
- results1[4], len(shap1), classification1,
384
- results2[4], len(shap2), classification2,
385
- avg_diff, std_diff, max_diff, min_diff, frac_different
386
- )
387
-
388
- # Create comparison heatmap
389
- heatmap_fig = plot_comparative_heatmap(shap_diff)
390
- heatmap_img = fig_to_image(heatmap_fig)
391
-
392
- # Create histogram of differences
393
- hist_fig = plot_shap_histogram(
394
- shap_diff,
395
- title="Distribution of SHAP Differences"
396
- )
397
- hist_img = fig_to_image(hist_fig)
398
-
399
- return comparison_text, heatmap_img, hist_img
400
-
401
-
402
- ###############################################################################
403
- # 8. SUBREGION ANALYSIS (Gradio Step 2)
404
- ###############################################################################
405
 
406
  def analyze_subregion(state, header, region_start, region_end):
407
  """
@@ -468,9 +466,8 @@ def analyze_subregion(state, header, region_start, region_end):
468
 
469
  return (region_info, heatmap_img, hist_img)
470
 
471
-
472
  ###############################################################################
473
- # NEW SECTION: COMPARATIVE ANALYSIS FUNCTIONS
474
  ###############################################################################
475
 
476
  def normalize_shap_lengths(shap1, shap2, num_points=1000):
@@ -583,7 +580,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
583
  # Compute difference (positive = seq2 more human-like)
584
  shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
585
 
586
- # Calculate some statistics
587
  avg_diff = np.mean(shap_diff)
588
  std_diff = np.std(shap_diff)
589
  max_diff = np.max(shap_diff)
@@ -594,7 +591,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
594
  substantial_diffs = np.abs(shap_diff) > threshold
595
  frac_different = np.mean(substantial_diffs)
596
 
597
- # Extract classifications safely without using f-strings with backslashes
598
  classification1 = results1[0].split('Classification: ')[1].split('\n')[0].strip()
599
  classification2 = results2[0].split('Classification: ')[1].split('\n')[0].strip()
600
 
@@ -603,7 +600,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
603
  len2_formatted = "{:,}".format(len(shap2))
604
  frac_formatted = "{:.2%}".format(frac_different)
605
 
606
- # Build comparison text without f-strings containing backslashes
607
  comparison_text = (
608
  "Sequence Comparison Results:\n"
609
  f"Sequence 1: {results1[4]}\n"
@@ -635,7 +632,7 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
635
  hist_img = fig_to_image(hist_fig)
636
 
637
  return comparison_text, heatmap_img, hist_img
638
-
639
  ###############################################################################
640
  # 9. BUILD GRADIO INTERFACE
641
  ###############################################################################
@@ -694,7 +691,6 @@ with gr.Blocks(css=css) as iface:
694
  seq_state = gr.State()
695
  header_state = gr.State()
696
 
697
- # analyze_sequence(...) returns 5 items
698
  analyze_btn.click(
699
  analyze_sequence,
700
  inputs=[file_input, top_k, text_input, win_size],
@@ -781,6 +777,7 @@ with gr.Blocks(css=css) as iface:
781
  inputs=[file_input1, file_input2, text_input1, text_input2],
782
  outputs=[comparison_text, diff_heatmap, diff_hist]
783
  )
 
784
  gr.Markdown("""
785
  ### Interface Features
786
  - **Overall Classification** (human vs non-human) using k-mer frequencies.
@@ -793,7 +790,28 @@ with gr.Blocks(css=css) as iface:
793
  - GC content
794
  - Fraction of positions pushing human vs. non-human
795
  - Simple logic-based classification
 
 
 
 
796
  """)
797
 
 
 
 
 
798
  if __name__ == "__main__":
799
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  import io
10
  from PIL import Image
11
  from scipy.interpolate import interp1d
 
12
 
13
  ###############################################################################
14
  # 1. MODEL DEFINITION
 
316
  return (gc_count / len(sequence)) * 100.0
317
 
318
  ###############################################################################
319
+ # 7. SEQUENCE ANALYSIS FUNCTIONS
320
  ###############################################################################
321
 
322
+ def analyze_sequence(file_path, top_k=10, fasta_text="", window_size=500):
323
  """
324
+ Analyze a virus sequence from a FASTA file or text input.
325
+ Returns (results_text, kmer_plot, heatmap_plot, state_dict, header)
326
  """
327
+ try:
328
+ # Load model and k-mer info
329
+ model = VirusClassifier(256) # 4^4 = 256 k-mers for k=4
330
+ model.load_state_dict(torch.load("model.pt"))
331
+ model.eval()
332
+ kmers = [''.join(p) for p in product("ACGT", repeat=4)]
333
+
334
+ # Process input (file takes precedence over text)
335
+ if file_path:
336
+ with open(file_path, 'r') as f:
337
+ fasta_text = f.read()
338
+
339
+ if not fasta_text.strip():
340
+ return ("Error: No sequence provided", None, None, {}, "")
341
+
342
+ # Parse FASTA
343
+ sequences = parse_fasta(fasta_text)
344
+ if not sequences:
345
+ return ("Error: No valid FASTA sequences found", None, None, {}, "")
346
+
347
+ header, sequence = sequences[0] # Take first sequence
348
+
349
+ # Convert to k-mer frequencies
350
+ x = sequence_to_kmer_vector(sequence)
351
+ x_tensor = torch.tensor(x).float().unsqueeze(0)
352
+
353
+ # Get model prediction
354
+ with torch.no_grad():
355
+ output = model(x_tensor)
356
+ probs = torch.softmax(output, dim=1)
357
+ pred_human = probs[0, 1].item()
358
+
359
+ # Calculate SHAP values
360
+ shap_values, prob = calculate_shap_values(model, x_tensor)
361
+
362
+ # Find most extreme regions
363
+ shap_means = compute_positionwise_scores(sequence, shap_values)
364
+ start_max, end_max, avg_max = find_extreme_subregion(shap_means, window_size, mode="max")
365
+ start_min, end_min, avg_min = find_extreme_subregion(shap_means, window_size, mode="min")
366
+
367
+ # Format results text
368
+ classification = "Human" if pred_human > 0.5 else "Non-human"
369
+ results = (
370
+ f"Classification: {classification} "
371
+ f"(probability of human = {pred_human:.3f})\n\n"
372
+ f"Sequence length: {len(sequence):,} bases\n"
373
+ f"Overall GC content: {compute_gc_content(sequence):.1f}%\n\n"
374
+ f"Most human-like {window_size}bp region:\n"
375
+ f"Position {start_max:,} to {end_max:,}\n"
376
+ f"Average SHAP: {avg_max:.4f}\n"
377
+ f"GC content: {compute_gc_content(sequence[start_max:end_max]):.1f}%\n\n"
378
+ f"Least human-like {window_size}bp region:\n"
379
+ f"Position {start_min:,} to {end_min:,}\n"
380
+ f"Average SHAP: {avg_min:.4f}\n"
381
+ f"GC content: {compute_gc_content(sequence[start_min:end_min]):.1f}%"
382
+ )
383
+
384
+ # Create k-mer importance plot
385
+ kmer_fig = create_importance_bar_plot(shap_values, kmers, top_k)
386
+ kmer_img = fig_to_image(kmer_fig)
387
+
388
+ # Create genome-wide heatmap
389
+ heatmap_fig = plot_linear_heatmap(shap_means)
390
+ heatmap_img = fig_to_image(heatmap_fig)
391
+
392
+ # Store data for subregion analysis
393
+ state = {
394
+ "seq": sequence,
395
+ "shap_means": shap_means
396
+ }
397
+
398
+ return results, kmer_img, heatmap_img, state, header
399
+
400
+ except Exception as e:
401
+ error_msg = f"Error analyzing sequence: {str(e)}"
402
+ return (error_msg, None, None, {}, "")
 
403
 
404
  def analyze_subregion(state, header, region_start, region_end):
405
  """
 
466
 
467
  return (region_info, heatmap_img, hist_img)
468
 
 
469
  ###############################################################################
470
+ # 8. COMPARISON ANALYSIS FUNCTIONS
471
  ###############################################################################
472
 
473
  def normalize_shap_lengths(shap1, shap2, num_points=1000):
 
580
  # Compute difference (positive = seq2 more human-like)
581
  shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
582
 
583
+ # Calculate statistics
584
  avg_diff = np.mean(shap_diff)
585
  std_diff = np.std(shap_diff)
586
  max_diff = np.max(shap_diff)
 
591
  substantial_diffs = np.abs(shap_diff) > threshold
592
  frac_different = np.mean(substantial_diffs)
593
 
594
+ # Extract classifications safely
595
  classification1 = results1[0].split('Classification: ')[1].split('\n')[0].strip()
596
  classification2 = results2[0].split('Classification: ')[1].split('\n')[0].strip()
597
 
 
600
  len2_formatted = "{:,}".format(len(shap2))
601
  frac_formatted = "{:.2%}".format(frac_different)
602
 
603
+ # Build comparison text
604
  comparison_text = (
605
  "Sequence Comparison Results:\n"
606
  f"Sequence 1: {results1[4]}\n"
 
632
  hist_img = fig_to_image(hist_fig)
633
 
634
  return comparison_text, heatmap_img, hist_img
635
+
636
  ###############################################################################
637
  # 9. BUILD GRADIO INTERFACE
638
  ###############################################################################
 
691
  seq_state = gr.State()
692
  header_state = gr.State()
693
 
 
694
  analyze_btn.click(
695
  analyze_sequence,
696
  inputs=[file_input, top_k, text_input, win_size],
 
777
  inputs=[file_input1, file_input2, text_input1, text_input2],
778
  outputs=[comparison_text, diff_heatmap, diff_hist]
779
  )
780
+
781
  gr.Markdown("""
782
  ### Interface Features
783
  - **Overall Classification** (human vs non-human) using k-mer frequencies.
 
790
  - GC content
791
  - Fraction of positions pushing human vs. non-human
792
  - Simple logic-based classification
793
+ - **Sequence Comparison**:
794
+ - Compare two sequences to identify regions of difference
795
+ - Normalized comparison to handle different sequence lengths
796
+ - Statistical summary of differences
797
  """)
798
 
799
+ ###############################################################################
800
+ # 10. MAIN EXECUTION
801
+ ###############################################################################
802
+
803
  if __name__ == "__main__":
804
+ # Set up any global configurations if needed
805
+ plt.style.use('default')
806
+ plt.rcParams['figure.figsize'] = [10, 6]
807
+ plt.rcParams['figure.dpi'] = 100
808
+ plt.rcParams['font.size'] = 10
809
+
810
+ # Launch the interface
811
+ iface.launch(
812
+ share=False, # Set to True to create a public link
813
+ server_name="0.0.0.0", # Listen on all network interfaces
814
+ server_port=7860, # Default Gradio port
815
+ show_api=False, # Hide API docs
816
+ debug=False # Set to True for debugging
817
+ )