Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12

Commit

d1cde92

verified ·

1 Parent(s): 2e254a9

Update app.py

Browse files

Files changed (1) hide show

app.py +223 -0

app.py CHANGED Viewed

@@ -474,7 +474,230 @@ def analyze_subregion(state, header, region_start, region_end):
     return (region_info, heatmap_img, hist_img)
 ###############################################################################
 # 9. BUILD GRADIO INTERFACE
 ###############################################################################

     return (region_info, heatmap_img, hist_img)
+# Add these imports at the top of the file, after existing imports
+from scipy.interpolate import interp1d
+import numpy as np
+###############################################################################
+# NEW SECTION: COMPARATIVE ANALYSIS FUNCTIONS
+###############################################################################
+def _resample_shap(values, x_new, label):
+    """Linearly interpolate one SHAP array onto the relative positions x_new."""
+    values = np.asarray(values, dtype=float)
+    if values.ndim != 1 or values.size == 0:
+        raise ValueError(
+            f"{label} must be a non-empty 1-D array of SHAP values, "
+            f"got shape {values.shape}"
+        )
+    # Spread the original samples evenly over the relative range [0, 1].
+    x_old = np.linspace(0, 1, values.size)
+    # np.interp accepts a single sample (it is extended as a constant),
+    # whereas scipy's interp1d raises when given fewer than two points.
+    return np.interp(x_new, x_old, values)
+def normalize_shap_lengths(shap1, shap2, num_points=1000):
+    """
+    Resample two SHAP arrays onto a common grid of `num_points` positions.
+
+    Each input is treated as samples spread evenly over the relative
+    position range [0, 1] and is linearly interpolated onto
+    np.linspace(0, 1, num_points), so that sequences of different lengths
+    can be compared position by position.
+
+    Args:
+        shap1: 1-D array-like of SHAP values for the first sequence.
+        shap2: 1-D array-like of SHAP values for the second sequence.
+        num_points: Number of positions in each resampled output.
+
+    Returns:
+        (normalized_shap1, normalized_shap2): two float arrays, each of
+        length `num_points`.
+
+    Raises:
+        ValueError: If either input is empty or not one-dimensional.
+    """
+    # Shared target grid so both outputs line up index by index
+    x_new = np.linspace(0, 1, num_points)
+    shap1_norm = _resample_shap(shap1, x_new, "shap1")
+    shap2_norm = _resample_shap(shap2, x_new, "shap2")
+    return shap1_norm, shap2_norm
+def compute_shap_difference(shap1_norm, shap2_norm):
+    """
+    Subtract the first normalized SHAP array from the second, element-wise.
+    A positive entry means seq2 scores as more "human-like" than seq1 at
+    that position; a negative entry means the opposite.
+    """
+    delta = shap2_norm - shap1_norm
+    return delta
+def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
+    """
+    Plot the difference between two sequences' SHAP values as a 1-row heatmap.
+    Red indicates seq2 is more human-like, blue indicates seq1 is more human-like.
+
+    Args:
+        shap_diff: 1-D array-like of SHAP differences (seq2 - seq1).
+        title: Title drawn above the heatmap.
+
+    Returns:
+        The matplotlib Figure containing the heatmap and its colorbar.
+
+    Raises:
+        ValueError: If `shap_diff` is empty.
+    """
+    shap_diff = np.asarray(shap_diff, dtype=float)
+    if shap_diff.size == 0:
+        raise ValueError("shap_diff must contain at least one value")
+    # Build 2D array for imshow
+    heatmap_data = shap_diff.reshape(1, -1)
+    # Force a symmetrical colour range so that zero sits at the colormap centre
+    limit = np.nanmax(np.abs(shap_diff))
+    if not np.isfinite(limit) or limit == 0:
+        # With vmin == vmax matplotlib maps every cell to the lowest colour,
+        # which would paint identical sequences solid blue. Fall back to a
+        # unit range so an all-zero difference renders as the neutral centre.
+        limit = 1.0
+    # Create figure with adjusted height ratio
+    fig, ax = plt.subplots(figsize=(12, 1.8))
+    # Colormap comes from a helper defined elsewhere in this file
+    # (presumably diverging and centred on zero - confirm against its definition)
+    custom_cmap = get_zero_centered_cmap()
+    # Plot heatmap; the extent makes the x ticks read 0-100 to match the
+    # axis label instead of showing raw sample indices
+    cax = ax.imshow(
+        heatmap_data,
+        aspect='auto',
+        cmap=custom_cmap,
+        vmin=-limit,
+        vmax=+limit,
+        extent=(0, 100, 0, 1)
+    )
+    # Attach the colorbar to this figure explicitly rather than relying on
+    # pyplot's notion of the "current" figure
+    cbar = fig.colorbar(
+        cax,
+        ax=ax,
+        orientation='horizontal',
+        pad=0.25,
+        aspect=40,
+        shrink=0.8
+    )
+    # Style the colorbar
+    cbar.ax.tick_params(labelsize=8)
+    cbar.set_label(
+        'SHAP Difference (Seq2 - Seq1)',
+        fontsize=9,
+        labelpad=5
+    )
+    # Configure main plot
+    ax.set_yticks([])
+    ax.set_xlabel('Normalized Position (0-100%)', fontsize=10)
+    ax.set_title(title, pad=10)
+    fig.subplots_adjust(
+        bottom=0.25,
+        left=0.05,
+        right=0.95
+    )
+    return fig
+def _extract_classification(results_text):
+    """Return the text after 'Classification: ' up to the end of that line."""
+    _, marker, remainder = str(results_text).partition('Classification: ')
+    if not marker:
+        # The summary has no classification line; report that instead of
+        # raising IndexError while building the comparison text.
+        return "Unknown"
+    return remainder.split('\n', 1)[0]
+def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
+    """
+    Compare two sequences by analyzing their SHAP differences.
+
+    Each sequence is run through analyze_sequence(); the per-position SHAP
+    means are resampled to a common length, subtracted (seq2 - seq1), and
+    summarized as text, a heatmap, and a histogram.
+
+    Args:
+        file1: First FASTA file, passed through to analyze_sequence().
+        file2: Second FASTA file, passed through to analyze_sequence().
+        fasta1: Pasted FASTA text for sequence 1 (forwarded as fasta_text).
+        fasta2: Pasted FASTA text for sequence 2 (forwarded as fasta_text).
+
+    Returns:
+        (comparison_text, heatmap_img, hist_img). If either analysis reports
+        an error, returns (error_message, None, None) instead.
+    """
+    # Process first sequence
+    results1 = analyze_sequence(file1, fasta_text=fasta1)
+    if isinstance(results1[0], str) and "Error" in results1[0]:
+        return (f"Error in sequence 1: {results1[0]}", None, None)
+    # Process second sequence
+    results2 = analyze_sequence(file2, fasta_text=fasta2)
+    if isinstance(results2[0], str) and "Error" in results2[0]:
+        return (f"Error in sequence 2: {results2[0]}", None, None)
+    # Get SHAP means from state dictionaries
+    shap1 = results1[3]["shap_means"]
+    shap2 = results2[3]["shap_means"]
+    # Normalize lengths
+    shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
+    # Compute difference (positive = seq2 more human-like)
+    shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
+    # Calculate some statistics
+    avg_diff = np.mean(shap_diff)
+    std_diff = np.std(shap_diff)
+    max_diff = np.max(shap_diff)
+    min_diff = np.min(shap_diff)
+    # Calculate what fraction of positions show substantial differences
+    threshold = 0.05  # Arbitrary threshold for "substantial" difference
+    substantial_diffs = np.abs(shap_diff) > threshold
+    frac_different = np.mean(substantial_diffs)
+    # Pull the classification labels out before formatting: a backslash
+    # inside an f-string replacement field is a SyntaxError before Python 3.12.
+    class1 = _extract_classification(results1[0])
+    class2 = _extract_classification(results2[0])
+    # Generate comparison text
+    comparison_text = f"""Sequence Comparison Results:
+Sequence 1: {results1[4]}
+Length: {len(shap1):,} bases
+Classification: {class1}
+Sequence 2: {results2[4]}
+Length: {len(shap2):,} bases
+Classification: {class2}
+Comparison Statistics:
+Average SHAP difference: {avg_diff:.4f}
+Standard deviation: {std_diff:.4f}
+Max difference: {max_diff:.4f} (Seq2 more human-like)
+Min difference: {min_diff:.4f} (Seq1 more human-like)
+Fraction of positions with substantial differences: {frac_different:.2%}
+Interpretation:
+Positive values (red) indicate regions where Sequence 2 is more "human-like"
+Negative values (blue) indicate regions where Sequence 1 is more "human-like"
+"""
+    # Create comparison heatmap
+    heatmap_fig = plot_comparative_heatmap(shap_diff)
+    heatmap_img = fig_to_image(heatmap_fig)
+    # Create histogram of differences
+    hist_fig = plot_shap_histogram(
+        shap_diff,
+        title="Distribution of SHAP Differences"
+    )
+    hist_img = fig_to_image(hist_fig)
+    return comparison_text, heatmap_img, hist_img
+###############################################################################
+# NEW TAB TO GRADIO
+###############################################################################
+# Inside the Gradio interface definition, add this new tab:
+    # Third tab: two FASTA inputs (file upload or pasted text for each),
+    # one button, and three outputs (summary text, heatmap, histogram).
+    with gr.Tab("3) Comparative Analysis"):
+        gr.Markdown("""
+        **Compare Two Sequences**
+        Upload or paste two FASTA sequences to compare their SHAP patterns.
+        The sequences will be normalized to the same length for comparison.
+        **Color Scale**:
+        - Red: Sequence 2 is more human-like in this region
+        - Blue: Sequence 1 is more human-like in this region
+        - White: No substantial difference
+        """)
+        # Side-by-side input columns, one per sequence
+        with gr.Row():
+            with gr.Column(scale=1):
+                file_input1 = gr.File(
+                    label="Upload first FASTA file",
+                    file_types=[".fasta", ".fa", ".txt"],
+                    type="filepath"
+                )
+                text_input1 = gr.Textbox(
+                    label="Or paste first FASTA sequence",
+                    placeholder=">sequence1\nACGTACGT...",
+                    lines=5
+                )
+            with gr.Column(scale=1):
+                file_input2 = gr.File(
+                    label="Upload second FASTA file",
+                    file_types=[".fasta", ".fa", ".txt"],
+                    type="filepath"
+                )
+                text_input2 = gr.Textbox(
+                    label="Or paste second FASTA sequence",
+                    placeholder=">sequence2\nACGTACGT...",
+                    lines=5
+                )
+        compare_btn = gr.Button("Compare Sequences", variant="primary")
+        # Read-only box that receives the comparison summary text
+        comparison_text = gr.Textbox(
+            label="Comparison Results",
+            lines=12,
+            interactive=False
+        )
+        with gr.Row():
+            diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
+            diff_hist = gr.Image(label="Distribution of SHAP Differences")
+        # Input order (file1, file2, text1, text2) matches the positional
+        # parameters of analyze_sequence_comparison(file1, file2, fasta1, fasta2);
+        # its three return values fill the three output components in order.
+        compare_btn.click(
+            analyze_sequence_comparison,
+            inputs=[file_input1, file_input2, text_input1, text_input2],
+            outputs=[comparison_text, diff_heatmap, diff_hist]
+        )
 ###############################################################################
 # 9. BUILD GRADIO INTERFACE
 ###############################################################################