Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12

Commit

90c03ec

verified ·

1 Parent(s): c5accc7

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -81

app.py CHANGED Viewed

@@ -320,92 +320,84 @@ def compute_gc_content(sequence):
 # 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
    """
    Analyze the first FASTA record and explain its human/non-human call.

    The sequence is taken from ``fasta_text`` when that contains any
    non-whitespace characters; otherwise it is read from ``file_obj``.
    Only the first record returned by ``parse_fasta`` is analyzed.

    Args:
        file_obj: Path of a FASTA file (handed to ``open``), or None.
        top_kmers: Number of k-mers forwarded to the importance bar plot.
        fasta_text: Raw FASTA text; takes precedence over ``file_obj``.
            ``None`` is treated the same as an empty string.
        window_size: Window width, in bases, used when searching for the
            strongest human-pushing / non-human-pushing subregions.

    Returns:
        A 5-tuple ``(results_text, bar_img, heatmap_img, state, header)``.
        ``state`` is a dict with keys ``"seq"`` and ``"shap_means"`` kept
        for later subregion analysis. On any failure the tuple is
        ``(message, None, None, None, None)``; callers can detect failure
        by checking whether ``state`` is None.
    """
    # Handle input. A caller may pass None instead of "" for the text box,
    # so normalise before calling .strip().
    pasted = (fasta_text or "").strip()
    if pasted:
        text = pasted
    elif file_obj is not None:
        try:
            with open(file_obj, 'r') as f:
                text = f.read()
        except Exception as e:
            # Best-effort boundary: report the problem to the UI, never crash.
            return (f"Error reading file: {str(e)}", None, None, None, None)
    else:
        return ("Please provide a FASTA sequence.", None, None, None, None)

    # Parse FASTA
    sequences = parse_fasta(text)
    if not sequences:
        return ("No valid FASTA sequences found.", None, None, None, None)
    header, seq = sequences[0]
    if not seq:
        # A header with no residues cannot be vectorised or scored.
        return ("No valid FASTA sequences found.", None, None, None, None)

    # Load model and scaler
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    try:
        # Use weights_only=True for safer loading
        state_dict = torch.load('model.pt', map_location=device, weights_only=True)
        model = VirusClassifier(256).to(device)
        model.load_state_dict(state_dict)
        scaler = joblib.load('scaler.pkl')
    except Exception as e:
        return (f"Error loading model/scaler: {str(e)}", None, None, None, None)

    # Vectorize + scale
    freq_vector = sequence_to_kmer_vector(seq)
    scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
    x_tensor = torch.FloatTensor(scaled_vector).to(device)

    # SHAP + classification
    shap_values, prob_human = calculate_shap_values(model, x_tensor)
    prob_nonhuman = 1.0 - prob_human
    classification = "Human" if prob_human > 0.5 else "Non-human"
    confidence = max(prob_human, prob_nonhuman)

    # Per-base SHAP
    shap_means = compute_positionwise_scores(seq, shap_values, k=4)

    # Find the most "human-pushing" region
    (max_start, max_end, max_avg) = find_extreme_subregion(shap_means, window_size, mode="max")
    # Find the most "non-human–pushing" region
    (min_start, min_end, min_avg) = find_extreme_subregion(shap_means, window_size, mode="min")

    # Build results text
    results_text = (
        f"Sequence: {header}\n"
        f"Length: {len(seq):,} bases\n"
        f"Classification: {classification}\n"
        f"Confidence: {confidence:.3f}\n"
        f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n"
        f"---\n"
        f"**Most Human-Pushing {window_size}-bp Subregion**:\n"
        f"Start: {max_start}, End: {max_end}, Avg SHAP: {max_avg:.4f}\n\n"
        f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n"
        f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
    )

    # K-mer importance plot (all 4-mers over ACGT, matching k=4 above)
    kmers = [''.join(p) for p in product("ACGT", repeat=4)]
    bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
    bar_img = fig_to_image(bar_fig)

    # Full-genome SHAP heatmap
    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
    heatmap_img = fig_to_image(heatmap_fig)

    # Store data for subregion analysis
    state_dict_out = {
        "seq": seq,
        "shap_means": shap_means
    }
    return (results_text, bar_img, heatmap_img, state_dict_out, header)
 ###############################################################################
 # 8. SUBREGION ANALYSIS (Gradio Step 2)

 # 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
    """
    Compare two sequences by analyzing their SHAP differences.

    Each input is run through ``analyze_sequence``; the two per-position
    SHAP tracks are brought to a common length, subtracted, summarised and
    plotted.

    Args:
        file1: FASTA file for sequence 1 (forwarded to ``analyze_sequence``).
        file2: FASTA file for sequence 2.
        fasta1: Raw FASTA text for sequence 1 (forwarded as ``fasta_text``).
        fasta2: Raw FASTA text for sequence 2.

    Returns:
        A 3-tuple ``(comparison_text, heatmap_img, hist_img)``. When either
        analysis fails, or there is nothing to compare, the tuple is
        ``(error_message, None, None)``.
    """
    analyses = []
    for index, (file_obj, fasta) in enumerate(((file1, fasta1), (file2, fasta2)), start=1):
        outcome = analyze_sequence(file_obj, fasta_text=fasta)
        # analyze_sequence reports every failure as (message, None, None, None, None).
        # Not all of its messages contain the word "Error", so test the state
        # slot rather than searching the text.
        if outcome[3] is None:
            return (f"Error in sequence {index}: {outcome[0]}", None, None)
        analyses.append(outcome)
    results1, results2 = analyses

    # Get SHAP means from state dictionaries
    shap1 = results1[3]["shap_means"]
    shap2 = results2[3]["shap_means"]

    # Normalize lengths
    shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)

    # Compute difference (positive = seq2 more human-like)
    shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
    if np.size(shap_diff) == 0:
        # np.max / np.min raise on empty input; report instead of crashing.
        return ("Error: no positions available to compare.", None, None)

    # Calculate some statistics
    avg_diff = np.mean(shap_diff)
    std_diff = np.std(shap_diff)
    max_diff = np.max(shap_diff)
    min_diff = np.min(shap_diff)

    # Calculate what fraction of positions show substantial differences
    threshold = 0.05  # Arbitrary threshold for "substantial" difference
    substantial_diffs = np.abs(shap_diff) > threshold
    frac_different = np.mean(substantial_diffs)

    # Generate comparison text
    classification1 = _extract_classification_summary(results1[0])
    classification2 = _extract_classification_summary(results2[0])
    comparison_text = (
        "Sequence Comparison Results:\n"
        "Sequence 1: {}\n"
        "Length: {:,} bases\n"
        "Classification: {}\n\n"
        "Sequence 2: {}\n"
        "Length: {:,} bases\n"
        "Classification: {}\n\n"
        "Comparison Statistics:\n"
        "Average SHAP difference: {:.4f}\n"
        "Standard deviation: {:.4f}\n"
        "Max difference: {:.4f} (Seq2 more human-like)\n"
        "Min difference: {:.4f} (Seq1 more human-like)\n"
        "Fraction of positions with substantial differences: {:.2%}\n\n"
        "Interpretation:\n"
        "Positive values (red) indicate regions where Sequence 2 is more human-like\n"
        "Negative values (blue) indicate regions where Sequence 1 is more human-like"
    ).format(
        results1[4], len(shap1), classification1,
        results2[4], len(shap2), classification2,
        avg_diff, std_diff, max_diff, min_diff, frac_different
    )

    # Create comparison heatmap
    heatmap_fig = plot_comparative_heatmap(shap_diff)
    heatmap_img = fig_to_image(heatmap_fig)

    # Create histogram of differences
    hist_fig = plot_shap_histogram(
        shap_diff,
        title="Distribution of SHAP Differences"
    )
    hist_img = fig_to_image(hist_fig)

    return comparison_text, heatmap_img, hist_img


def _extract_classification_summary(results_text):
    """Return the text between the 'Classification: ' line and the next '('.

    The marker is matched only at the start of a line, so a FASTA header that
    happens to contain 'Classification: ' cannot be mistaken for it. Whatever
    precedes the first '(' after the marker is kept, stripped of surrounding
    whitespace. Returns "Unknown" when no such line exists instead of raising
    IndexError.
    """
    _, marker, tail = ("\n" + results_text).partition("\nClassification: ")
    if not marker:
        return "Unknown"
    return tail.split("(")[0].strip()
 ###############################################################################
 # 8. SUBREGION ANALYSIS (Gradio Step 2)